抽取文本信息(支持doc、docx、pdf)
环境要求:Python 3.6;PyCharm IDE;Anaconda
需要插件:win32com(由 pywin32 / pypiwin32 包提供,仅支持 Windows,且本机需已安装 Microsoft Word)
PS:建议在 cmd 中执行 python -m pip install pypiwin32 进行安装。
# coding=utf-8
# Convert .doc / .docx / .pdf documents to plain-text files by driving
# Microsoft Word through COM automation (pywin32).
import fnmatch  # imported by the original script; kept for other users of this module
import os
import sys

try:
    from win32com import client as wc
    from win32com.client import Dispatch, gencache
except ImportError:
    # pywin32 is missing: keep the module importable so the pure helper
    # TranType() still works; Files2txt() reports the problem when called.
    wc = Dispatch = gencache = None

# Value passed as Word's SaveAs "FileFormat" argument (WdSaveFormat.wdFormatDOSText).
_WD_FORMAT_DOS_TEXT = 4

# For each accepted `typename`, the filename endings that may be replaced.
# '.docx' is listed before '.doc' only for readability; the endings cannot
# both match the same filename tail.
_REPLACEABLE_ENDINGS = {
    '.pdf': ('.pdf',),
    '.doc': ('.docx', '.doc'),
    '.docx': ('.docx', '.doc'),
}


def Files2txt(filePath, savePath=''):
    """Save the document at ``filePath`` as a ``.txt`` file using Word.

    Args:
        filePath: Path of the source document (.doc, .docx or .pdf).
            Relative paths are resolved against the current working
            directory before being handed to Word.
        savePath: Directory for the generated text file. When empty, the
            text file is written next to the source document.

    Returns:
        The absolute path of the text file on success, otherwise ``None``.
        Failures are reported on stdout rather than raised, matching the
        best-effort behaviour of the original script.
    """
    # Word resolves relative paths against its own working directory, so
    # always give it absolute paths.
    filePath = os.path.abspath(filePath)
    dirs, filename = os.path.split(filePath)

    typename = os.path.splitext(filename)[-1].lower()
    new_name = TranType(filename, typename)
    if new_name is None:
        # TranType already printed the warning for unsupported suffixes.
        return None

    if not os.path.isfile(filePath):
        print('程序出错了!', '文件不存在:', filePath)
        return None

    new_save_path = os.path.abspath(os.path.join(savePath or dirs, new_name))
    print('保存路径:', new_save_path)

    try:
        if wc is None:
            raise RuntimeError('未安装 pywin32,无法调用 Word')
        word = wc.Dispatch('Word.Application')
        try:
            document = word.Documents.Open(filePath)
            try:
                document.SaveAs(new_save_path, _WD_FORMAT_DOS_TEXT)
            finally:
                document.Close()
        finally:
            # Without Quit() every call leaves a hidden Word process behind.
            # NOTE(review): assumes Dispatch started a dedicated Word
            # instance rather than attaching to one the user has open.
            word.Quit()
    except Exception as e:
        # Top-level boundary of the tool: report the cause instead of hiding it.
        print('程序出错了!', e)
        return None

    print('处理完成,请查看!')
    return new_save_path


def TranType(filename, typename):
    """Build the ``.txt`` filename for a source document.

    Args:
        filename: Name of the source file, e.g. ``'resume.docx'``.
        typename: Suffix of the source file including the dot
            (``'.pdf'``, ``'.doc'`` or ``'.docx'``); compared
            case-insensitively.

    Returns:
        ``filename`` with its suffix replaced by ``'.txt'``. ``None`` when
        ``typename`` is not supported (a warning is printed) or when
        ``filename`` does not end with a suffix matching ``typename``.
    """
    endings = _REPLACEABLE_ENDINGS.get(typename.lower())
    if endings is None:
        print('警告:\n 您输入【', typename, '】不合法,工具仅支持pdf/doc/docx格式,请输入正确的格式。')
        return None

    for ending in endings:
        # Compare only the tail, case-insensitively, so 'A.DOCX' is handled
        # the same way on every platform.
        if filename[-len(ending):].lower() == ending:
            return filename[:-len(ending)] + '.txt'
    return None


if __name__ == '__main__':
    # Paths given on the command line are converted; with no arguments the
    # original sample document is used.
    default_path = r'D:\Python\study\test\抽取文本信息\智联范本.docx'
    for path in sys.argv[1:] or [default_path]:
        Files2txt(os.path.abspath(path))
未完待续...