def parse_rep_vote_history(self, response):
    """Parse one representative's vote-history page.

    Some representatives did not vote during a session; their page carries
    the text "Vote data is unavailable" instead of a vote table. For those
    pages only the identifiers derived from the URL are recorded on
    ``self.rep_info`` (as ``[rep_id, session_id, chamber]``) so they can be
    matched later, and nothing is returned. For every other page the vote
    table is parsed and returned row by row.

    Args:
        response: The fetched page. It must provide ``css``, ``xpath`` and
            ``url`` as used below (presumably a Scrapy response -- confirm
            against the caller).

    Returns:
        ``None`` when vote data is unavailable; otherwise a list with one
        dict per table row, keyed by ``session_id``, ``chamber``,
        ``rep_id``, ``rep_short_name``, ``rep_title``, ``district``,
        ``'RCS\\xa0#'`` (note the non-breaking space), ``Doc.``, ``Vote``
        and ``Result``.

    Raises:
        IndexError: If the page has no ``#mainBody > table`` element.
    """
    # Local import keeps this block self-contained; StringIO is needed because
    # newer pandas versions no longer accept literal HTML in read_html.
    from io import StringIO

    # Session, chamber and representative ids are encoded in the page URL
    # and are needed on both paths.
    session_id, chamber, rep_id = self.get_session_chamber_rep_id(response.url)

    # The "unavailable" notice is looked for in the fourth text node of
    # #mainBody. Guard the index so a page with fewer text nodes falls
    # through to table parsing instead of raising IndexError.
    body_text = response.css("#mainBody::text").extract()
    if len(body_text) > 3 and "Vote data is unavailable" in body_text[3]:
        # Capture the base information about the rep for later matching.
        self.rep_info.append([rep_id, session_id, chamber])
        return None

    title = response.xpath("""//*[@id="title"]/text()""").extract_first()
    rep_title, rep_short_name, rep_district = self.get_name_district(title)

    # The page uses nested tables, so take the table that is a direct child
    # of #mainBody rather than searching the whole document.
    table_html = response.css('#mainBody > table').extract()[0]

    # Parse the HTML table and keep only the columns relevant to a vote.
    vote_columns = ['RCS\xa0#', 'Doc.', 'Vote', 'Result']
    parsed = pd.read_html(
        StringIO(table_html),
        header=0,
        match="Doc.",
        attrs={'cellspacing': 0},
    )[0]

    # assign() returns a new frame, so the identifier columns are never
    # written onto a slice of the parsed table.
    votes = parsed[vote_columns].assign(
        session_id=session_id,
        chamber=chamber,
        rep_id=rep_id,
        rep_title=rep_title,
        rep_short_name=rep_short_name,
        district=rep_district,
    )

    # Reorder columns. reindex(columns=...) replaces reindex_axis(..., axis=1),
    # which was removed in pandas 1.0.
    ordered_columns = [
        'session_id', 'chamber', 'rep_id', 'rep_short_name', 'rep_title',
        'district', 'RCS\xa0#', 'Doc.', 'Vote', 'Result',
    ]
    votes = votes.reindex(columns=ordered_columns)
    return votes.to_dict(orient='records')
评论列表
文章目录