def parse_detail_page(content):
    """Extract property details from a listing detail page.

    Parameters
    ----------
    content : str
        Raw HTML of the detail page.

    Returns
    -------
    list[dict] | None
        A single-element list containing the property dict, or ``None``
        when the page has no ``table.cell`` detail table.
    """
    prop = {
        'raw_address': '',
        'bedrooms': -1,          # -1 sentinels mean "not found on page"
        'bathrooms': -1,
        'size_units': 'I',
        'building_size': -1,
        'price': -1,
        'car_spaces': -1,
        'listing_type': 'F',
        'features': [],
    }
    # TODO: use the extended fields
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces',
                    'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    b = soup.BeautifulSoup(content)
    tables = b.findAll('table', {'class': 'cell'})
    if not tables:
        return None

    prop['listing_timestamp'] = datetime.datetime.now()
    # The street address is split across several <td class="addr"> cells.
    addr_rows = b.findAll('td', {'class': 'addr'})
    prop['raw_address'] = ' '.join(cell.getText() for cell in addr_rows)

    # The first 'cell' table holds label/value rows; column 0 is the
    # label, column 1 the value.
    df = pd.read_html(str(tables[0]))[0]
    data = dict(zip(df[0], df[1]))

    # Guard every field lookup: detail pages do not always carry every
    # row, and a missing optional row should not abort the whole parse.
    if 'Bedrooms' in data:
        prop['bedrooms'] = int(data['Bedrooms'])
    # Encode e.g. 2 full + 1 partial bath as the float 2.1.
    if 'Full Baths' in data and 'Partial Baths' in data:
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
    if 'Interior Sq Ft' in data:
        prop['building_size'] = int(data['Interior Sq Ft'])
    if 'Asking Price' in data:
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
    if 'Parking' in data:
        try:
            # Values look like "2 Cars" / "1 Car"; strip the unit words.
            prop['car_spaces'] = float(
                data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
        except ValueError:
            prop['car_spaces'] = -1

    # TODO: fold other_fields into prop['features'] once the schema is settled.
    # for of in other_fields:
    #     if of in data:
    #         prop['features'].append({of: data[of]})
    return [prop]
# Takes a string of the raw version of the page and extracts any links we might want to crawl
# (removed non-code residue from a web copy-paste: "评论列表" = "comment list",
#  "文章目录" = "article table of contents" — page-navigation text, not source)