Answers:
试试PDFMiner。它可以从PDF文件中以HTML,SGML或“标记的PDF”格式提取文本。
Tagged PDF格式似乎是最干净的格式,而去掉XML标签只会留下纯文本。
Python 3版本在以下位置可用:
编辑(再次):
PDFMiner版本已再次更新 20100213
您可以使用以下方法检查已安装的版本:
>>> import pdfminer
>>> pdfminer.__version__
'20100213'
这是更新的版本(带有有关我更改/添加的内容的注释):
def pdf_to_csv(filename):
    """Return the text of a PDF as ';'-separated lines (PDFMiner 20100213 API).

    Text items that share the same integer y coordinate are written as one
    output line, ordered by x coordinate and joined with ';'. The output of
    every page is wrapped in "START PAGE n" / "END PAGE n" marker lines,
    where n is the zero-based page index.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The extracted text as a single utf-8 encoded string.
    """
    from collections import defaultdict
    from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
    from pdfminer.converter import LTTextItem, TextConverter
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def end_page(self, i):
            # Maps int(-y) -> {x: text} for every text item on the page.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTTextItem):
                    (_, _, x, y) = child.bbox  #<-- changed
                    # The key is negated so that an ascending sort of the
                    # keys visits larger y values first.
                    lines[int(-y)][x] = child.text.encode(self.codec)  #<-- changed

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)  #<-- changed
        parser.set_document(doc)  #<-- added
        doc.set_parser(parser)  #<-- added
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
    finally:
        # Release the device and the input file even if parsing fails.
        try:
            device.close()
        finally:
            fp.close()

    return outfp.getvalue()
编辑(再次):
下面是最新版本的更新的PyPI,20100619p1
。简而言之,我替换LTTextItem
了LTChar
LAParams实例并将其传递给CsvConverter构造函数。
def pdf_to_csv(filename):
    """Return the text of a PDF as ';'-separated lines (PDFMiner 20100619p1 API).

    LTChar objects that share the same integer y coordinate are written as
    one output line, ordered by x coordinate and joined with ';'. The output
    of every page is wrapped in "START PAGE n" / "END PAGE n" marker lines,
    where n is the zero-based page index.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The extracted text as a single utf-8 encoded string.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter  #<-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def end_page(self, i):
            # Maps int(-y) -> {x: text} for every character on the page.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, LTChar):  #<-- changed
                    (_, _, x, y) = child.bbox
                    # The key is negated so that an ascending sort of the
                    # keys visits larger y values first.
                    lines[int(-y)][x] = child.text.encode(self.codec)

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
    finally:
        # Release the device and the input file even if parsing fails.
        try:
            device.close()
        finally:
            fp.close()

    return outfp.getvalue()
编辑(再过一次):
已更新版本20110515
(感谢Oeufcoque Penteano!):
def pdf_to_csv(filename):
    """Return the text of a PDF as ';'-separated lines (PDFMiner 20110515 API).

    LTChar objects that share the same integer y coordinate are written as
    one output line, ordered by x coordinate and joined with ';'. The output
    of every page is wrapped in "START PAGE n" / "END PAGE n" marker lines,
    where n is the zero-based page index.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The extracted text as a single utf-8 encoded string.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def end_page(self, i):
            # Maps int(-y) -> {x: text} for every character on the page.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:  #<-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    # The key is negated so that an ascending sort of the
                    # keys visits larger y values first.
                    lines[int(-y)][x] = child._text.encode(self.codec)  #<-- changed

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
    finally:
        # Release the device and the input file even if parsing fails.
        try:
            device.close()
        finally:
            fp.close()

    return outfp.getvalue()
LTTextItem
为LTChar
。 unixuser.org/~euske/python/pdfminer/index.html#changes
20110515
根据您的评论,我在答案的版本中添加了另一部分。
由于这些解决方案都不支持最新版本的PDFMiner,因此我编写了一个简单的解决方案,该解决方案将使用PDFMiner返回pdf文本。这将对那些遇到导入错误的人有用process_pdf
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
    """Print the text extracted from a PDF file (Python 2 variant).

    Args:
        data: path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # file() is the Python 2 builtin; on Python 3 it has to be open().
    fp = file(data, 'rb')
    try:
        # Process each page contained in the document.
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
    finally:
        # Release the device and the input file even if a page fails.
        try:
            device.close()
        finally:
            fp.close()

    # Kept in its own name so the path argument is not overwritten.
    text = retstr.getvalue()
    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])
请参阅以下适用于Python 3的代码:
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
def pdfparser(data):
    """Print the text extracted from a PDF file (Python 3 variant).

    Args:
        data: path of the PDF file to read.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(data, 'rb') as fp:
        try:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        finally:
            # Release the device even if a page fails to process.
            device.close()

    # Kept in its own name so the path argument is not overwritten.
    text = retstr.getvalue()
    print(text)


if __name__ == '__main__':
    pdfparser(sys.argv[1])
python3
,除了在print
命令后带有明显的括号外,还必须将file
命令替换为包open
并StringIO
从中导入io
Pdftotext一个开放源代码程序(Xpdf的一部分),您可以从python调用它(不是您所要求的,但可能有用)。我使用它没有问题。我认为Google在Google桌面中使用它。
-layout
选择使文本保持与PDF相同的位置。现在,只要我能弄清楚如何将PDF内容导入其中。
pdftotext
似乎工作得很好,但如果要在stdout上查看结果,则需要第二个参数连字符。
find . -iname "*.pdf" -exec pdftotext -enc UTF-8 -eol unix -raw {} \;
默认情况下,生成的文件的原始名称为.txt
扩展。
pyPDF可以正常工作(假设您使用的是格式正确的PDF)。如果您只需要文本(带空格),则可以执行以下操作:
import pyPdf
# Keep the file open for the whole loop and close it once all pages
# have been printed.
with open(filename, "rb") as stream:
    pdf = pyPdf.PdfFileReader(stream)
    for page in pdf.pages:
        print(page.extractText())
您还可以轻松访问元数据,图像数据等。
extractText代码中的注释说明:
按照在内容流中提供的顺序找到所有文本绘制命令,然后提取文本。这对于某些PDF文件效果很好,但对其他PDF文件效果不佳,具体取决于所使用的生成器。将来会对此进行完善。不要依赖此功能产生的文本顺序,因为如果此功能变得更复杂,它将改变。
这是否是一个问题取决于您对文本的处理方式(例如,顺序无关紧要,就可以了,或者如果生成器按显示顺序将文本添加到流中,就可以了) 。我在日常使用中有pyPdf提取代码,没有任何问题。
您也可以很容易地将pdfminer用作库。您可以访问pdf的内容模型,并且可以创建自己的文本提取。我这样做是使用以下代码将pdf内容转换为以分号分隔的文本。
该函数仅根据TextItem内容对象的y和x坐标对它们进行排序,并输出与一条文本行具有相同y坐标的项目,并用';'将同一行上的对象分隔开 字符。
使用这种方法,我能够从pdf提取文本,而其他工具无法提取适合进一步解析的内容。我尝试过的其他工具包括pdftotext,ps2ascii和在线工具pdftextonline.com。
pdfminer是pdf抓取的宝贵工具。
def pdf_to_csv(filename):
    """Return the text of a PDF as ';'-separated lines (old ``pdflib`` API).

    TextItem objects that share the same integer y coordinate are written as
    one output line, ordered by x coordinate and joined with ';'. The output
    of every page is wrapped in "START PAGE n" / "END PAGE n" marker lines,
    where n is the zero-based page index.

    NOTE(review): this targets an old release of the library that was
    imported as ``pdflib``; newer releases use different module and class
    names.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The extracted text as a single string.
    """
    from collections import defaultdict
    from cStringIO import StringIO  # was missing: StringIO is used below
    from pdflib.page import TextItem, TextConverter
    from pdflib.pdfparser import PDFDocument, PDFParser
    from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that writes each page as ';'-separated rows."""

        def end_page(self, i):
            # Maps int(-y) -> {x: text} for every text item on the page.
            lines = defaultdict(dict)
            for child in self.cur_item.objs:
                if isinstance(child, TextItem):
                    (_, _, x, y) = child.bbox
                    # The key is negated so that an ascending sort of the
                    # keys visits larger y values first.
                    lines[int(-y)][x] = child.text

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, "ascii")

    doc = PDFDocument()
    fp = open(filename, 'rb')
    try:
        parser = PDFParser(doc, fp)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)
    finally:
        # Release the device and the input file even if parsing fails.
        try:
            device.close()
        finally:
            fp.close()

    return outfp.getvalue()
更新:
上面的代码是针对API的旧版本编写的,请参见下面的评论。
pdfminer
,不是pdflib
)。我建议您pdf2txt.py
在PDFminer源代码中查看其源代码,上面的代码受该文件的旧版本启发。
slate
是一个使从库中使用PDFMiner变得非常简单的项目:
>>> with open('example.pdf') as f:
... doc = slate.PDF(f)
...
>>> doc
[..., ..., ...]
>>> doc[1]
'Text from page 2...'
我需要在python模块中将特定的PDF转换为纯文本。在阅读了他们的pdf2txt.py工具后,我使用了PDFMiner 20110515,我编写了以下简单代码段:
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
def to_txt(pdf_path):
    """Return the plain text of a PDF file (PDFMiner 20110515 API).

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        The text that TextConverter wrote for the whole document.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())

    input_ = open(pdf_path, 'rb')
    try:
        process_pdf(manager, converter, input_)
    finally:
        # Release the converter and the input file even if parsing fails.
        try:
            converter.close()
        finally:
            input_.close()

    return output.getvalue()
C:\Python27\Scripts\pdfminer\tools\pdf2txt.py
重新利用pdfminer随附的pdf2txt.py代码;您可以使函数采用pdf路径;(可选)输出类型(txt | html | xml | tag),并选择类似命令行pdf2txt {'-o':'/path/to/outfile.txt'...}。默认情况下,您可以调用:
convert_pdf(path)
将创建一个文本文件,该文件在文件系统上为原始pdf的同级文件。
def convert_pdf(path, outtype='txt', opts=None):
    """Convert a PDF file to text, HTML, XML or tag output.

    This repurposes the option handling of pdfminer's ``pdf2txt.py``. With
    no options the result is written next to the input file, using the
    input's base name and ``outtype`` as the extension.

    Args:
        path: path of the PDF file to read.
        outtype: 'txt', 'html', 'xml' or 'tag'; an empty value means 'txt'.
            An output file name ending in .htm, .html, .xml or .tag
            overrides it.
        opts: pdf2txt-style options, either a mapping such as
            ``{'-o': '/path/to/outfile.txt'}`` or an iterable of
            ``(flag, value)`` pairs. Recognised flags: -d, -p, -m, -P, -o,
            -n, -A, -D, -M, -L, -W, -O, -t, -c, -s. An empty ``-o`` value
            sends the output to stdout.

    Raises:
        ValueError: if the resulting output type is not supported.
    """
    import os
    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.cmapdb import CMapDB

    if opts is None:
        opts = {}
    # Iterating a mapping yields only its keys, so go through items();
    # an iterable of (flag, value) pairs is accepted unchanged.
    pairs = opts.items() if hasattr(opts, 'items') else opts

    # None means "derive the name from path once outtype is final".
    outfile = None
    outdir = os.path.dirname(path)
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
    if outfile is None:
        # Derived only now so that a '-t' option is reflected in the name.
        outfile = os.path.splitext(path)[0] + '.' + outtype
    if outfile:
        if outfile.endswith('.htm') or outfile.endswith('.html'):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'
    # Reject an unknown type before the output file is created or truncated.
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            fp = open(path, 'rb')
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
            finally:
                fp.close()
        finally:
            device.close()
    finally:
        # Only close a file opened here; sys.stdout is left open.
        if outfp is not sys.stdout:
            outfp.close()
    return
PDFminer在我尝试使用的pdf文件的每一页上给了我一行[第7页,共1页...]。
到目前为止,我最好的答案是pdftoipe或基于Xpdf的c ++代码。
请参阅我的问题,了解pdftoipe的输出是什么样的。
另外,还有PDFTextStream,这是一个商业Java库,也可以从Python使用。
我用过pdftohtml
这个-xml
参数,用读取结果subprocess.Popen()
,它将为您提供pdf 中每个文本片段的x坐标,y坐标,宽度,高度和字体。我认为这也是“证据”可能使用的原因,因为出现了相同的错误消息。
如果您需要处理柱状数据,由于必须发明一种适合pdf文件的算法,它会变得稍微复杂一些。问题在于制作PDF文件的程序实际上不一定以任何逻辑格式对文本进行布局。您可以尝试使用简单的排序算法,该算法有时会起作用,但是可能很少出现“散乱”和“杂散”的情况,这些文字不会按照您认为的顺序排列。所以你必须要有创造力。
我花了大约5个小时才找到一份我正在研究的pdf文件。但现在效果很好。祝好运。
今天找到了该解决方案。对我来说很棒。甚至将PDF页面呈现为PNG图像。 http://www.swftools.org/gfx_tutorial.html