def parse_detail_page(content):
    """Extract property details from a listing detail page.

    Parameters
    ----------
    content : str
        Raw HTML of the detail page.

    Returns
    -------
    list[dict] | None
        A single-element list containing the property dict, or ``None``
        when the page has no ``table.cell`` detail table.
    """
    prop = {
        'raw_address': '',
        'bedrooms': -1,          # -1 sentinels mean "not found on page"
        'bathrooms': -1,
        'size_units': 'I',
        'building_size': -1,
        'price': -1,
        'car_spaces': -1,
        'listing_type': 'F',
        'features': [],
    }
    # TODO: use the extended fields
    other_fields = ['Age', 'Association', 'Basement', 'Cooling', 'Fireplaces',
                    'Garages', 'Heating', 'Pool', 'Sewer', 'Taxes (Year)', 'Water']
    b = soup.BeautifulSoup(content)
    tables = b.findAll('table', {'class': 'cell'})
    if not tables:
        return None

    prop['listing_timestamp'] = datetime.datetime.now()
    # The street address is split across several <td class="addr"> cells.
    addr_rows = b.findAll('td', {'class': 'addr'})
    prop['raw_address'] = ' '.join(cell.getText() for cell in addr_rows)

    # The first 'cell' table holds label/value rows; column 0 is the
    # label, column 1 the value.
    df = pd.read_html(str(tables[0]))[0]
    data = dict(zip(df[0], df[1]))

    # Guard every field lookup: detail pages do not always carry every
    # row, and a missing optional row should not abort the whole parse.
    if 'Bedrooms' in data:
        prop['bedrooms'] = int(data['Bedrooms'])
    # Encode e.g. 2 full + 1 partial bath as the float 2.1.
    if 'Full Baths' in data and 'Partial Baths' in data:
        prop['bathrooms'] = float(data['Full Baths'] + '.' + data['Partial Baths'])
    if 'Interior Sq Ft' in data:
        prop['building_size'] = int(data['Interior Sq Ft'])
    if 'Asking Price' in data:
        prop['price'] = float(data['Asking Price'].replace('$', '').replace(',', ''))
    if 'Parking' in data:
        try:
            # Values look like "2 Cars" / "1 Car"; strip the unit words.
            prop['car_spaces'] = float(
                data['Parking'].replace('Cars', '').replace('Car', '').replace(' ', ''))
        except ValueError:
            prop['car_spaces'] = -1

    # TODO: fold other_fields into prop['features'] once the schema is settled.
    # for of in other_fields:
    #     if of in data:
    #         prop['features'].append({of: data[of]})
    return [prop]
# Takes a string of the raw version of the page and extracts any links we might want to crawl
# (removed non-code residue from a web copy-paste: "评论列表" = "comment list",
#  "文章目录" = "article table of contents" — page-navigation text, not source)