def get_plaintext_document_body(fpath, keep_layout=False):
"""Given a file-path to a full-text, return a list of unicode strings
whereby each string is a line of the fulltext.
In the case of a plain-text document, this simply means reading the
contents in from the file. In the case of a PDF however,
this means converting the document to plaintext.
It raises UnknownDocumentTypeError if the document is not a PDF or
plain text.
@param fpath: (string) - the path to the fulltext file
@return: (list) of strings - each string being a line in the document.
"""
textbody = []
mime_type = magic.from_file(fpath, mime=True)
if mime_type == "text/plain":
with open(fpath, "r") as f:
textbody = [line.decode("utf-8") for line in f.readlines()]
elif mime_type == "application/pdf":
textbody = convert_PDF_to_plaintext(fpath, keep_layout)
else:
raise UnknownDocumentTypeError(mime_type)
return textbody
评论列表
文章目录