def file_parser(fname, pages=None):
    """Extract the text of a file as UTF-8 encoded bytes.

    Files whose MIME type (as reported by ``magic.from_file``) is
    ``application/pdf`` are walked page by page through ``pdf.Document``;
    every other file is handed to ``parser.from_file`` and its ``'content'``
    entry is used.

    Args:
        fname: Path of the file to parse.
        pages: Optional number of PDF pages to read before stopping.
            ``None`` (the default) reads the whole document. Only used for
            PDF files.

    Returns:
        The extracted text encoded as UTF-8 (for PDFs, one entry per text
        line joined with newlines; for other files, the parser content or an
        empty string when the content is empty/``None``). Returns ``None``
        when parsing raised an exception; the error is printed, not raised.
    """
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            encoded_lines = []
            # Start at 0 so the count is defined even when the document
            # yields no pages; otherwise the loop variable stays unbound and
            # the summary print below fails with a NameError.
            page_count = 0
            for page_count, page in enumerate(pdf.Document(fname), start=1):
                # A page nests two container levels above the objects that
                # carry `.text` (presumably flow -> block -> line; confirm
                # against the pdf library in use).
                for flow in page:
                    for block in flow:
                        encoded_lines.extend(
                            line.text.encode('UTF-8') for line in block
                        )
                if page_count == pages:  # stop after the requested page count
                    break
            print("Processed %i pages" % page_count)
            # Every item is an encoded byte string, so join with a bytes
            # separator: identical on Python 2, and valid on Python 3.
            return b'\n'.join(encoded_lines)
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and signal
            # it to the caller with None. The two spaces reproduce the output
            # of the original `print "...: ", e` statement.
            print("PDF Parser Exception:  %s" % (e,))
            return None
    else:
        try:
            content = parser.from_file(fname)['content']
            # The parser may report no content at all; normalise to ''.
            return (content or '').encode('UTF-8')
        except Exception as e:
            # Same best-effort handling as the PDF branch.
            print("File Parser Exception:  %s" % (e,))
            return None
# 评论列表 (comment list)
# 文章目录 (table of contents)