# Source: https://blog.csdn.net/lishk314/article/details/83539622
def parse(self, response):
    """Scrape one warehouse listing page and follow its related links.

    Builds a ``StorageItem`` from the page and yields it only when a
    company name was extracted.  Afterwards, every link found under
    ``<dd class="intro">`` is scheduled with this same method as callback.

    Every item field except ``crawlUrl`` stores the list returned by
    ``.extract()`` (possibly empty), not a single string.

    Args:
        response: The page response passed to this callback.

    Yields:
        The populated ``StorageItem`` (when a company name is present),
        followed by one ``Request`` per discovered link.
    """
    sel = Selector(response)
    item = StorageItem()
    intro = sel.xpath('//div[@class="map_intro clear"]')
    article = sel.xpath('//div[@class="map_article"]')

    item['crawlUrl'] = response.url
    item['enterpriseName'] = intro.xpath('dl/dd[1]/text()').extract()  # company name
    item['contactUser'] = intro.xpath('dl/dd[2]/text()').extract()  # contact person
    item['contactNumber'] = intro.xpath('dl/dd[3]/b/text()').extract()  # contact phone
    item['warehouseType'] = intro.xpath('dl/dd[4]/text()').extract()  # warehouse type
    item['releaseTime'] = intro.xpath('dl/dt/span/text()').extract()  # release time
    item['warehouseAddress'] = article.xpath('div/span/text()').extract()  # region
    item['warehouseDetailAddr'] = article.xpath('div/text()[2]').extract()  # detailed address

    # Warehouse size: first try the cell right after the one labelled
    # "仓库规模"; if nothing matches, fall back to the second cell of the rows
    # following the one labelled "仓库建设方案".
    # NOTE(review): both paths require a literal <tbody> in the served HTML;
    # browsers add that element on their own, so confirm it is present in
    # the raw response.
    size_cells = article.xpath('table/tbody/tr/td[contains(text(),"仓库规模")]/following-sibling::td[position()=1]')
    if not size_cells:  # empty selector list
        size_cells = article.xpath('table/tbody/tr/td[contains(text(),"仓库建设方案")]/../following-sibling::tr/td[position()=2]')
    # XPath translate() works per character: it strips non-breaking spaces
    # and each of the characters "平" / "米", then normalize-space() trims
    # and collapses the remaining whitespace.
    item['warehouseSize'] = size_cells.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()

    # Pages without a company name are not emitted as items.
    if item['enterpriseName']:
        yield item

    # Links are followed whether or not an item was emitted.
    for href in sel.xpath('//dd[@class="intro"]/a/@href').extract():
        # Resolve against the current page so relative hrefs do not make
        # Request() fail on a missing scheme; absolute URLs pass through
        # unchanged.
        yield Request(url=response.urljoin(href), callback=self.parse)
# ---------------------
# 作者:木子_lishk
# 来源:CSDN
# 原文:https://blog.csdn.net/lishk314/article/details/83539622
# 版权声明:本文为博主原创文章,转载请附上博文链接!