def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
    """Parse a doc-format protocol from object storage and store its outputs.

    The protocol object is downloaded to a temporary location, opened with
    ``CommitteeMeetingProtocol.get_from_filename``, its ``text`` is written
    to ``text_object_name`` (with ``public_bucket=True``), and the protocol
    is then handed to ``self._parse_protocol_parts`` together with
    ``parts_object_name``.

    :param committee_id: used only in the failure log message
    :param meeting_id: used only in the failure log message
    :param bucket: bucket passed to every object storage call
    :param protocol_object_name: name of the source protocol object
    :param parts_object_name: target object name forwarded to
        ``_parse_protocol_parts``
    :param text_object_name: target object name for the protocol text
    :return: ``True`` when parsing completed, ``False`` when one of the
        known parse failures was caught, logged and skipped
    """
    logging.info(
        "parsing doc protocol {} --> {}, {}".format(
            protocol_object_name, parts_object_name, text_object_name
        )
    )
    # Single exit point: flipped to False only by a known, tolerated failure.
    parsed_ok = True
    with object_storage.temp_download(self.s3, bucket, protocol_object_name) as local_doc_path:
        try:
            with CommitteeMeetingProtocol.get_from_filename(local_doc_path) as doc_protocol:
                # The text is written first; the parts are handled afterwards.
                object_storage.write(self.s3, bucket, text_object_name, doc_protocol.text, public_bucket=True)
                self._parse_protocol_parts(bucket, parts_object_name, doc_protocol)
        except (
            AntiwordException,  # see https://github.com/hasadna/knesset-data-pipelines/issues/15
            subprocess.SubprocessError,
            xml.etree.ElementTree.ParseError,  # see https://github.com/hasadna/knesset-data-pipelines/issues/32
        ):
            # Known failure modes are logged with a traceback and skipped
            # rather than propagated to the caller.
            failure_message = "committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id)
            logging.exception(failure_message)
            parsed_ok = False
    return parsed_ok
parse_committee_meeting_protocols.py 文件源码
python
阅读 23
收藏 0
点赞 0
评论 0
评论列表
文章目录