parse_committee_meeting_protocols.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:knesset-data-pipelines 作者: hasadna 项目源码 文件源码
def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
        """Parse a stored .doc protocol and write its text and parts back to storage.

        The protocol object is downloaded to a temporary local file, opened via
        ``CommitteeMeetingProtocol.get_from_filename``, its ``text`` is written
        to ``text_object_name`` (with ``public_bucket=True``) and the protocol
        is then handed to ``self._parse_protocol_parts`` together with
        ``parts_object_name``.

        :param committee_id: used only in the failure log message
        :param meeting_id: used only in the failure log message
        :param bucket: bucket passed to every object storage call
        :param protocol_object_name: name of the source protocol object
        :param parts_object_name: object name forwarded to ``_parse_protocol_parts``
        :param text_object_name: object name the protocol text is written to
        :return: ``True`` when parsing completed, ``False`` when one of the
            known parse failures was raised (it is logged with its traceback
            and the file is skipped)
        """
        start_message = "parsing doc protocol {} --> {}, {}".format(protocol_object_name, parts_object_name, text_object_name)
        logging.info(start_message)
        parsed_ok = True
        with object_storage.temp_download(self.s3, bucket, protocol_object_name) as local_doc_path:
            try:
                with CommitteeMeetingProtocol.get_from_filename(local_doc_path) as doc_protocol:
                    object_storage.write(
                        self.s3,
                        bucket,
                        text_object_name,
                        doc_protocol.text,
                        public_bucket=True,
                    )
                    self._parse_protocol_parts(bucket, parts_object_name, doc_protocol)
            except (
                    AntiwordException,  # see https://github.com/hasadna/knesset-data-pipelines/issues/15
                    subprocess.SubprocessError,
                    xml.etree.ElementTree.ParseError  # see https://github.com/hasadna/knesset-data-pipelines/issues/32
            ):
                # Known, non-fatal parse failures: record the traceback and
                # report the file as skipped instead of propagating the error.
                failure_message = "committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id)
                logging.exception(failure_message)
                parsed_ok = False
        return parsed_ok
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号