编辑(再次):
PDFMiner已在版本0213中再次更新。
您可以使用以下命令检查已安装的版本:
>>> import pdfminer
>>> pdfminer.__version__
'0213'
# Here is the updated version (with comments about what I changed/added):
def pdf_to_csv(filename):
    """Extract the text of a PDF as semicolon-separated lines.

    Written against the PDFMiner release discussed above (version 0213),
    which exposes ``LTTextItem`` in ``pdfminer.converter``. Python 2 only
    (uses ``cStringIO``).

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A string holding, for every page, a ``STARTPAGE<i>`` marker, one
        line per group of text items whose cells are joined with ``;``,
        and an ``ENDPAGE<i>`` marker.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTTextItem, TextConverter
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            # Map: row key -> {x coordinate -> encoded text}.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTTextItem):
                    # Only the last two bbox values are used as (x, y).
                    (_, _, x, y) = child.bbox
                    # Items sharing the same int(-y) land on the same row.
                    lines[int(-y)][x] = child.text.encode(self.codec)

            # Ascending int(-y) means descending y -- presumably top to
            # bottom of the page; confirm against the PDF coordinate system.
            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # codec is passed explicitly because my test documents are utf-8
    # (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8")

    doc = PDFDocument()
    fp = open(filename, 'rb')
    # try/finally (rather than ``with``) keeps this usable on old Python 2
    # interpreters while still closing the file if parsing raises.
    try:
        parser = PDFParser(fp)
        # Parser and document must be linked in both directions.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("STARTPAGE%d\n" % i)
            interpreter.process_page(page)
            outfp.write("ENDPAGE%d\n" % i)

        device.close()
    finally:
        fp.close()

    return outfp.getvalue()
编辑(再一次):
# Here is another update for the latest version on PyPI, 0619p1. In short,
# I replaced LTTextItem with LTChar and passed an instance of LAParams to
# the CsvConverter constructor.
def pdf_to_csv(filename):
    """Extract the text of a PDF as semicolon-separated lines.

    Written against the PDFMiner release discussed above (version 0619p1):
    characters are matched as ``LTChar`` and the converter is given an
    ``LAParams`` instance. Python 2 only (uses ``cStringIO``).

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A string holding, for every page, a ``STARTPAGE<i>`` marker, one
        line per group of characters whose cells are joined with ``;``,
        and an ``ENDPAGE<i>`` marker.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter  # LTChar replaces LTTextItem
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            # Map: row key -> {x coordinate -> encoded text}.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox values are used as (x, y).
                    (_, _, x, y) = child.bbox
                    # Items sharing the same int(-y) land on the same row.
                    lines[int(-y)][x] = child.text.encode(self.codec)

            # Ascending int(-y) means descending y -- presumably top to
            # bottom of the page; confirm against the PDF coordinate system.
            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # codec is passed explicitly because my test documents are utf-8
    # (note: utf-8 is the default codec); laparams is new in this version.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    fp = open(filename, 'rb')
    # try/finally (rather than ``with``) keeps this usable on old Python 2
    # interpreters while still closing the file if parsing raises.
    try:
        parser = PDFParser(fp)
        # Parser and document must be linked in both directions.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("STARTPAGE%d\n" % i)
            # Pages yielded as None are skipped, but still get both markers.
            if page is not None:
                interpreter.process_page(page)
            outfp.write("ENDPAGE%d\n" % i)

        device.close()
    finally:
        fp.close()

    return outfp.getvalue()
编辑(再编辑一次):
# Updated for version 0515 (thanks to Oeufcoque Peneano!):
def pdf_to_csv(filename):
    """Extract the text of a PDF as semicolon-separated lines.

    Written against the PDFMiner release discussed above (version 0515):
    the page's children are read from ``cur_item._objs`` and each
    character's text from ``child._text``. Python 2 only (uses
    ``cStringIO``).

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A string holding, for every page, a ``STARTPAGE<i>`` marker, one
        line per group of characters whose cells are joined with ``;``,
        and an ``ENDPAGE<i>`` marker.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            # Map: row key -> {x coordinate -> encoded text}.
            lines = defaultdict(dict)
            # This version reads the underscore-prefixed attributes
            # ``_objs`` and ``_text`` instead of ``objs`` and ``text``.
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the last two bbox values are used as (x, y).
                    (_, _, x, y) = child.bbox
                    # Items sharing the same int(-y) land on the same row.
                    lines[int(-y)][x] = child._text.encode(self.codec)

            # Ascending int(-y) means descending y -- presumably top to
            # bottom of the page; confirm against the PDF coordinate system.
            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # codec is passed explicitly because my test documents are utf-8
    # (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    fp = open(filename, 'rb')
    # try/finally (rather than ``with``) keeps this usable on old Python 2
    # interpreters while still closing the file if parsing raises.
    try:
        parser = PDFParser(fp)
        # Parser and document must be linked in both directions.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("STARTPAGE%d\n" % i)
            # Pages yielded as None are skipped, but still get both markers.
            if page is not None:
                interpreter.process_page(page)
            outfp.write("ENDPAGE%d\n" % i)

        device.close()
    finally:
        fp.close()

    return outfp.getvalue()