1200字范文,内容丰富有趣,写作的好帮手!
1200字范文 > 机器学习 数据预处理之数据打标签

机器学习 数据预处理之数据打标签

时间:2021-03-28 00:11:36

相关推荐

机器学习 数据预处理之数据打标签

工作内容:

1.读取pdf文档内容

2.分页显示

3.每个数据后设置下拉框供手动打标签

4.数据录入txt文档

代码:

import math
import os
import time
import tkinter as tk
import tkinter.ttk as ttk
from tkinter import *  # noqa: F401,F403 - kept from the original script
from tkinter.messagebox import *  # noqa: F401,F403 - provides askyesno

import pdfplumber as pp  # requires: pip install pdfplumber


class GUI(object):
    """Tkinter tool for hand-labelling the text lines of PDF resumes.

    Workflow: choose a PDF from the drop-down, press the extract button to
    show its text lines spread over notebook tabs (each line paired with a
    read-only label drop-down), then press the record button to append one
    "<label><text>" line per entry to ``OUTPUT_FILE``.

    Constructing an instance builds the window and blocks in ``mainloop()``
    until the window is closed.
    """

    # Directory scanned for resumes when no ``resume_dir`` is passed.
    DEFAULT_RESUME_DIR = r'D:\hk\微信资料\OA测试简历\OA测试简历'
    # Number of text lines shown on one notebook tab.
    LINES_PER_PAGE = 21
    # File that labelled lines are appended to (relative to the working dir).
    OUTPUT_FILE = 'data.txt'
    # NOTE(review): the original writes label and text with an empty string
    # between them; a tab/space may have been lost when the code was
    # published. Kept empty to preserve the existing output format - confirm.
    FIELD_SEPARATOR = ''

    def __init__(self, resume_dir=None):
        """Build the main window and enter the Tk event loop.

        Args:
            resume_dir: Directory containing the ``.pdf`` files to label.
                Defaults to ``DEFAULT_RESUME_DIR``.
        """
        print('begin time:', self._timestamp())
        self.resume_dir = (
            self.DEFAULT_RESUME_DIR if resume_dir is None else resume_dir
        )
        self.resume_kinds = ('基础资料', '教育经历', '校园经历', '项目经历', '工作经历',
                             '专业技能', '自我评价', '求职意向', '其他')
        self.file_list = self._list_pdf_files(self.resume_dir)
        self.content, self.page_num = [], 0
        self.lb_text, self.cmb = [], []

        # Main window.
        self.root = Tk()
        self.root.title('简历打标签')
        self.root.geometry("680x620")

        # Paged area (one tab per page of extracted lines).
        self.tab_main = ttk.Notebook()
        self.tab_main.place(relx=0.05, rely=0.1, relwidth=0.9, relheight=0.8)

        # File drop-down.
        self.cmb_files = ttk.Combobox(self.root, state='readonly')
        self.cmb_files['value'] = self.file_list
        if self.file_list:
            # current(0) raises TclError on an empty value list.
            self.cmb_files.current(0)
        self.cmb_files.place(relx=0.3, rely=0, relwidth=0.3, relheight=0.05)

        # "Extract" button: load the selected file.
        self.extract_butt = Button(
            self.root, text='提取',
            command=lambda: self._extract_file(self.cmb_files.get()))
        self.extract_butt.place(relx=0.61, rely=0, relwidth=0.05,
                                relheight=0.05)

        # "Record" button: append the labelled lines to the output file.
        self.writein_butt = Button(self.root, text='录入',
                                   command=self._write_in)
        self.writein_butt.place(relx=0.5, rely=0.91, relwidth=0.05,
                                relheight=0.05)

        self.root.mainloop()

    @staticmethod
    def _timestamp():
        """Return the current local time formatted as YYYY-mm-dd HH:MM:SS."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    @staticmethod
    def _list_pdf_files(directory):
        """Return the names of the ``.pdf`` entries directly inside *directory*.

        A missing directory yields an empty list (with a console message)
        so the window can still open instead of crashing at start-up.
        """
        if not os.path.isdir(directory):
            print('resume directory not found:', directory)
            return []
        return [name for name in os.listdir(directory)
                if name.endswith('.pdf')]

    def _read_pdf_lines(self, path):
        """Return every text line of the PDF at *path*, in page order."""
        lines = []
        # The context manager closes the underlying file handle.
        with pp.open(path) as pdf:
            for page in pdf.pages:
                # Pages without a text layer may yield no text at all.
                text = page.extract_text() or ''
                lines += text.splitlines()
        return lines

    def _extract_file(self, file):
        """Handler for the extract button: load *file* and rebuild the tabs.

        Args:
            file: File name (not a full path) inside ``self.resume_dir``.
                Anything not ending in ``.pdf`` is ignored.
        """
        if not file.endswith('.pdf'):
            return

        # Read the text content.
        path = os.path.join(self.resume_dir, file)
        print('path:', path)
        self.content = self._read_pdf_lines(path)
        self.lb_text, self.cmb = [], []
        print(len(self.content))

        per_page = self.LINES_PER_PAGE
        self.page_num = math.ceil(len(self.content) / per_page)
        print('page_num:', self.page_num)

        # Throw away the previous notebook and start from a clean one.
        self.tab_main.destroy()
        self.tab_main = ttk.Notebook()
        self.tab_main.place(relx=0.05, rely=0.05, relwidth=0.9,
                            relheight=0.85)

        for page_index in range(self.page_num):
            tab = Frame(self.tab_main)
            tab.pack()
            labels_area = Frame(tab)
            cmbs_area = Frame(tab)
            labels_area.place(relx=0, rely=0, relwidth=0.9, relheight=1)
            cmbs_area.place(relx=0.9, rely=0, relwidth=0.1, relheight=1)
            self.tab_main.add(tab, text='%i' % (page_index + 1))

            start = page_index * per_page
            for line in self.content[start:start + per_page]:
                Label(labels_area, text=line).pack(anchor=E)
                # Spaces are stripped from the text that gets recorded.
                self.lb_text.append(line.replace(' ', ''))
                chooser = ttk.Combobox(cmbs_area, state='readonly')
                chooser['value'] = self.resume_kinds
                chooser.current(0)
                chooser.pack()
                self.cmb.append(chooser)

    def _write_in(self):
        """Handler for the record button: append labelled lines to disk.

        Asks for confirmation first; on "yes" appends one line per
        extracted text line (selected label + separator + text) to
        ``OUTPUT_FILE`` using UTF-8.
        """
        confirm = askyesno('提示框', '是否录入数据?(此操作会影响文本录入信息)')
        if not confirm:
            return
        with open(self.OUTPUT_FILE, 'a', encoding='utf-8') as f:
            for chooser, text in zip(self.cmb, self.lb_text):
                f.write(chooser.get() + self.FIELD_SEPARATOR + text + '\n')
        print('录入成功')
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this print belonged here or just before mainloop(); it is
        # placed here so that it marks the end of a labelling pass - confirm.
        print('finish time:', self._timestamp())


if __name__ == '__main__':
    gui = GUI()

界面展示:

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。