parse_committee_meeting_protocols.py 文件源码-python代码片段

parse_committee_meeting_protocols.py 文件源码

python

阅读 38 收藏 0 点赞 0 评论 0

项目：knesset-data-pipelines 作者: hasadna 项目源码文件源码

def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
        """Parse a stored doc protocol and write its text and parts back to storage.

        The protocol object is fetched from ``bucket`` via
        ``object_storage.temp_download``, opened with
        ``CommitteeMeetingProtocol.get_from_filename``, and then:

        * ``protocol.text`` is written to ``text_object_name``
          (with ``public_bucket=True``);
        * the protocol is handed to ``self._parse_protocol_parts`` together
          with ``parts_object_name``.

        ``committee_id`` and ``meeting_id`` are used only in the failure log
        message.

        Returns:
            ``True`` when no handled error occurred; ``False`` when one of the
            known parsing errors listed in the ``except`` clause was raised
            (the failure is logged with its traceback and the file is skipped).
            Any other exception propagates to the caller.
        """
        start_message = "parsing doc protocol {} --> {}, {}".format(protocol_object_name, parts_object_name, text_object_name)
        logging.info(start_message)
        # Stays True unless one of the known, tolerated parsing errors is hit.
        parsed_ok = True
        with object_storage.temp_download(self.s3, bucket, protocol_object_name) as local_protocol_path:
            try:
                with CommitteeMeetingProtocol.get_from_filename(local_protocol_path) as doc_protocol:
                    object_storage.write(self.s3, bucket, text_object_name, doc_protocol.text, public_bucket=True)
                    self._parse_protocol_parts(bucket, parts_object_name, doc_protocol)
            except (
                    # antiword failures: https://github.com/hasadna/knesset-data-pipelines/issues/15
                    AntiwordException,
                    subprocess.SubprocessError,
                    # malformed XML: https://github.com/hasadna/knesset-data-pipelines/issues/32
                    xml.etree.ElementTree.ParseError
            ):
                failure_message = "committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id)
                logging.exception(failure_message)
                parsed_ok = False
        return parsed_ok