宁波妈妈网

回帖：https://blog.csdn.net/lishk314/article/details/83539622

def parse(self, response):
    """Scrape one warehouse listing page and follow the links found on it.

    Yields a ``StorageItem`` when a company name was extracted from the
    page, then one ``Request`` per link under ``<dd class="intro">`` with
    this method as the callback.

    Every extracted field is stored as the list returned by ``.extract()``.
    """
    sel = Selector(response)
    item = StorageItem()
    summary = sel.xpath('//div[@class="map_intro clear"]')
    article = sel.xpath('//div[@class="map_article"]')

    item['crawlUrl'] = response.url
    item['enterpriseName'] = summary.xpath('dl/dd[1]/text()').extract()  # company name
    item['contactUser'] = summary.xpath('dl/dd[2]/text()').extract()  # contact person
    item['contactNumber'] = summary.xpath('dl/dd[3]/b/text()').extract()  # contact phone
    item['warehouseType'] = summary.xpath('dl/dd[4]/text()').extract()  # warehouse type
    item['releaseTime'] = summary.xpath('dl/dt/span/text()').extract()  # release time

    item['warehouseAddress'] = article.xpath('div/span/text()').extract()  # region
    item['warehouseDetailAddr'] = article.xpath('div/text()[2]').extract()  # detailed address

    # Prefer cells that follow another cell in the same row; when there are
    # none, fall back to the cells of the rows after the first matching row.
    size_cells = article.xpath('table/tbody/tr/td/following-sibling::td')
    if not size_cells:
        size_cells = article.xpath('table/tbody/tr/td/../following-sibling::tr/td')
    # Drop non-breaking spaces and the characters of the "平米" unit suffix,
    # then collapse the remaining whitespace.
    item['warehouseSize'] = size_cells.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()

    # Only emit pages that actually carry a company name.
    if item['enterpriseName']:
        yield item

    for href in sel.xpath('//dd[@class="intro"]/a/@href').extract():
        # Request() rejects URLs without a scheme; urljoin resolves relative
        # hrefs against the page URL and leaves absolute ones untouched.
        yield Request(url=response.urljoin(href), callback=self.parse)
---------------------
作者：木子_lishk
来源：CSDN
原文：https://blog.csdn.net/lishk314/article/details/83539622
版权声明：本文为博主原创文章，转载请附上博文链接！

北斗星回帖于2019-03-28 15:27

下一楼›：#每爬完一个网页会回调parse方法
def parse(self, response):
hx=response.xpath('/ ..(北斗星)
‹上一楼：https://www.jianshu.com/p/09e29b0a4b29(北斗星)

查看全部回帖(10)

«返回主帖