# vhr_spider.py -- Python source code snippet

def parse_rep_vote_history(self, response):
        """Parse one representative's vote-history page into row dicts.

        If the page reports "Vote data is unavailable", the representative's
        ``[rep_id, session_id, chamber]`` triple is appended to
        ``self.rep_info`` and nothing is returned. Otherwise the vote table
        under ``#mainBody`` is parsed and one dict per vote row is returned.

        Args:
            response: The page response; must expose ``url``, ``css()`` and
                ``xpath()`` (presumably a Scrapy response -- confirm against
                the caller).

        Returns:
            ``None`` when vote data is unavailable; otherwise a list of dicts
            keyed by ``session_id``, ``chamber``, ``rep_id``,
            ``rep_short_name``, ``rep_title``, ``district``, ``RCS\\xa0#``,
            ``Doc.``, ``Vote`` and ``Result``.
        """
        # Function-scope import so this block does not rely on a file-level
        # ``io`` import that may not exist.
        from io import StringIO

        # Session, chamber and rep id are all derived from the page URL and are
        # needed by both branches below.
        session_id, chamber, rep_id = self.get_session_chamber_rep_id(response.url)

        # Some reps did not vote during a session. Test for the
        # "Vote data is unavailable" notice in any direct text node of
        # #mainBody rather than a fixed position, so that a page with fewer
        # text nodes than expected does not raise IndexError.
        body_texts = response.css("#mainBody::text").extract()
        if any("Vote data is unavailable" in text for text in body_texts):
            # Capture the base information about the rep for later matching.
            self.rep_info.append([rep_id, session_id, chamber])
            return None

        # Otherwise, process the body of the page.
        title = response.xpath("""//*[@id="title"]/text()""").extract_first()
        rep_title, rep_short_name, rep_district = self.get_name_district(title)

        # Fetch the main table - the page uses nested tables, so a direct
        # child reference is required.
        table_html = response.css('#mainBody > table').extract()[0]

        # Parse the HTML table and keep only the vote-related columns.
        # StringIO: passing literal HTML to read_html is deprecated in recent
        # pandas. copy(): avoid assigning new columns onto a selection of the
        # parsed frame.
        vote_columns = ['RCS\xa0#', 'Doc.', 'Vote', 'Result']
        pd_table = pd.read_html(
            StringIO(table_html),
            header=0,
            match="Doc.",
            attrs={'cellspacing': 0},
        )[0][vote_columns].copy()

        # Tag every row with the session / chamber / representative details.
        pd_table['session_id'] = session_id
        pd_table['chamber'] = chamber
        pd_table['rep_id'] = rep_id
        pd_table['rep_title'] = rep_title
        pd_table['rep_short_name'] = rep_short_name
        pd_table['district'] = rep_district

        # Reorder columns. reindex(columns=...) replaces reindex_axis, which
        # was removed in pandas 1.0.
        ordered_columns = [
            'session_id', 'chamber', 'rep_id', 'rep_short_name', 'rep_title',
            'district',
        ] + vote_columns
        pd_table = pd_table.reindex(columns=ordered_columns)

        return pd_table.to_dict(orient='records')