def parse(self, response):
    """Yield one ``WangdaizhijiaItem`` per row of the ``tb_content`` table.

    Each ``<tr>`` matched under ``//*[@id="tb_content"]/div[3]/table/tbody``
    is turned into an item with these keys:

    * ``pm``     -- first ``span`` text of the row's first ``td``
    * ``ptmc``   -- list of ``span`` texts inside ``td/a[@target="_blank"]``
    * ``cjl``, ``pjll``, ``pjjkqx``, ``ljdhje`` -- the first four direct
      text nodes of the row's ``td`` cells, in document order

    Raises:
        IndexError: if a row has no ``td``, no span text in its first
            cell, or fewer than four direct ``td`` text nodes.
    """
    rows = response.xpath('//*[@id="tb_content"]/div[3]/table/tbody/tr')
    for sel in rows:
        item = WangdaizhijiaItem()
        item['pm'] = sel.xpath('td')[0].xpath('span/text()').extract()[0]
        # Stored as the full list of matches, unlike the scalar fields below.
        item['ptmc'] = sel.xpath('td/a[@target="_blank"]/span/text()').extract()
        # Evaluate the cell-text XPath once per row rather than once per field.
        cell_texts = sel.xpath('td/text()').extract()
        item['cjl'] = cell_texts[0]
        item['pjll'] = cell_texts[1]
        item['pjjkqx'] = cell_texts[2]
        item['ljdhje'] = cell_texts[3]
        yield item
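# NOTE: the original trailing comments were lost to a bad encoding; the
# meanings below are inferred from the pinyin initialisms -- confirm.
# pm = scrapy.Field()      # rank
# ptmc = scrapy.Field()    # platform name
# cjl = scrapy.Field()     # transaction volume
# pjll = scrapy.Field()    # average interest rate
# pjjkqx = scrapy.Field()  # average loan term
# ljdhje = scrapy.Field()  # cumulative outstanding amount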
# //*[@id="tb_content"]/div[3]/table/tbody
python类Field()的实例源码
def test_single_item_in_the_feed(self):
    """Export each fixture item alone and compare the feed with its expected file.

    Every entry of ``self.items`` is pushed through the pipeline twice:
    once directly, and once assigned to the ``rss`` attribute of an
    ``ExtendableItem`` subclass. Both runs are compared against
    ``expected_rss/<item_name>.rss`` located next to this test module.
    """
    class SuperItem(ExtendableItem):
        some_field = scrapy.Field()

        def __init__(self):
            super(SuperItem, self).__init__()
            self.rss = RssItem()

    def export_and_compare(exported_item, name):
        # NOTE(review): SOURCE lost its indentation; the feed file is assumed
        # to be read only after the CrawlerContext block has exited -- confirm.
        with CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(exported_item, context.spider)
        with open(self.feed_settings['feed_file']) as data, \
                open(os.path.join(os.path.dirname(__file__),
                                  'expected_rss', '{}.rss'.format(name))) as expected:
            self.assertUnorderedXmlEquivalentOutputs(data=data.read(), expected=expected.read())

    for item_name, item in self.items.items():
        # First pass: the fixture item itself.
        export_and_compare(item, item_name)

        # Second pass: the same item carried in the wrapper's ``rss`` attribute.
        wrapper = SuperItem()
        wrapper.rss = item
        export_and_compare(wrapper, item_name)
def parse(self, response):
    """Dump the raw response to ``ele.json`` and build one ``ELEItem`` per entry.

    The response body is written verbatim (binary mode) to ``ele.json`` in
    the current working directory, then decoded as JSON. For each element
    of the decoded value an ``ELEItem`` is created from its ``name``,
    ``recent_order_num`` and ``average_cost`` entries; a key that is not
    present falls back to the placeholder ``'??'``.

    Returns:
        list: the ``ELEItem`` instances, in the order of the JSON elements.
    """
    # The context manager closes the dump file even if write() raises.
    with open('ele.json', 'wb') as fp:
        fp.write(response.body)

    # NOTE(review): '??' looks like a mis-encoded placeholder string; it is
    # kept byte-for-byte because it is runtime data.
    defaults = (
        ('name', '??'),
        ('recent_order_num', '??'),
        ('average_cost', '??'),
    )

    items = []
    for itemjson in json.loads(response.body):
        # Same membership-then-index lookup as before, so elements that are
        # not dicts behave exactly as they did.
        values = {
            key: (itemjson[key] if key in itemjson else fallback)
            for key, fallback in defaults
        }
        items.append(ELEItem(**values))
    return items
def test_item_validation(self):
    """Check that the pipeline and element assignment reject invalid items.

    Covers, in order: an ``RssItem`` with only ``enclosure.url`` set, a
    custom element with two required attributes of which one is set, and
    ``ExtendableItem`` subclasses that the pipeline rejects with
    "Item must have 'rss'".
    """
    # NOTE(review): assertRaisesRegexp is a deprecated unittest alias; it is
    # kept because the supported Python versions are not visible here.
    missing_required = 'required attributes .*? not set'

    incomplete_item = RssItem()
    incomplete_item.enclosure.url = 'http://example.com/content'
    with self.assertRaisesRegexp(InvalidRssItemAttributesError, missing_required), \
            CrawlerContext(**self.feed_settings) as context:
        context.ipm.process_item(incomplete_item, context.spider)

    class NonStandardElement(ItemElement):
        first_attribute = ItemElementAttribute(required=True, is_content=True)
        second_attribute = ItemElementAttribute(required=True)

    class NonStandardItem(RssItem):
        element = NonStandardElement()

    nonstandard_item = NonStandardItem()
    # Assigning a plain value to the element itself is expected to fail.
    with self.assertRaisesRegexp(InvalidElementValueError, 'Could not assign'):
        nonstandard_item.element = 'valid value'
    # NOTE(review): SOURCE lost its indentation; this assignment is assumed
    # to sit outside the assertRaises block above -- confirm.
    nonstandard_item.element.first_attribute = 'valid value'
    with self.assertRaisesRegexp(InvalidRssItemAttributesError, missing_required), \
            CrawlerContext(**self.feed_settings) as context:
        context.ipm.process_item(nonstandard_item, context.spider)

    class InvalidSuperItem1(ExtendableItem):
        pass

    class InvalidSuperItem2(ExtendableItem):
        field = scrapy.Field()

    class InvalidSuperItem3(ExtendableItem):
        rss = scrapy.Field()

    rejected_classes = (InvalidSuperItem1, InvalidSuperItem2, InvalidSuperItem3)
    for invalid_item_cls in rejected_classes:
        # The item is instantiated inside both context managers, as before.
        with self.assertRaisesRegexp(InvalidRssItemError, "Item must have 'rss'"), \
                CrawlerContext(**self.feed_settings) as context:
            context.ipm.process_item(invalid_item_cls(), context.spider)
def document_to_item(document_class):
    """Build an ``Item`` subclass whose fields mirror ``document_class``.

    A ``Field()`` is registered for every name that ``dir(document_class)``
    reports but ``dir(EmptyDocument)`` does not, plus ``'id'``.

    Args:
        document_class: class whose attribute names become item fields; it
            is also called with the item's contents as keyword arguments
            by ``concrete()``.

    Returns:
        The newly created ``Item`` subclass.
    """
    class DocumentAsItemClass(Item):
        def concrete(self):
            """Return ``document_class`` built from this item's key/value pairs."""
            return document_class(**self)

    # Build the exclusion set once so each membership test is O(1) instead
    # of a linear scan of the dir() list.
    exclude_fields = set(dir(EmptyDocument))
    document_fields = [field for field in dir(document_class) if field not in exclude_fields]
    for field in document_fields + ['id']:
        DocumentAsItemClass.fields[field] = Field()
    return DocumentAsItemClass