python 2.7 - Unable to get the text from a span using splash-scrapy -
I am trying a few examples to scrape the following — name, description, price, and image — from:
http://www.lucy.com/shop/shop-by-activity-running/extra-mile-top-0a2s96?variationid=005
I am facing 2 problems here:
a) I am able to get the name and description correctly, but the price is not being scraped correctly. The values I receive for the price are the following:
'selling_price': [u'\xa0'],
'selling_price': [u'$35.00'],
b) I don't know how I can extract the image, since a canvas element shows the image when I inspect it in Firebug. Maybe I need to use JavaScript, which I am struggling with.
The following is the spider I have created:
# -*- coding: utf-8 -*-
# NOTE(review): this block was recovered from a scrape that lower-cased all
# identifiers and stripped stopwords ("from", "as", "for", "and"). Casing of
# Python/Scrapy names has been restored from the public Scrapy/scrapy-splash
# APIs; casing of data-JSON keys (e.g. 'retailerproductconfigurationurls')
# could not be recovered and is kept exactly as scraped — verify against the
# real API payload.
import scrapy
import urllib2  # Python 2 only; use urllib.request on Python 3
import datetime
import simplejson as json

from scrapy.spiders import CrawlSpider

from dashcrawler.items import DashcrawlerItem
from dashcrawler import constant


class LucySpider(CrawlSpider):
    """Spider for lucy.com product-detail pages (PDPs).

    Pulls a list of retailer product-configuration URLs from an internal
    API, then fetches each product page through Splash (so JS-rendered
    content is available) and extracts name, price, description and image.
    """

    name = "lucy"

    custom_settings = {
        # Scrapy setting names are upper-case by convention; restored here.
        "DUPEFILTER_CLASS": 'dashcrawler.middlewares.CustomFilter',
        "DOWNLOADER_CLIENTCONTEXTFACTORY": 'dashcrawler.context.CustomContextFactory',
        # "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36",
        "RETRY_TIMES": 4,
        "RETRY_HTTP_CODES": [504, 400],
        "CONCURRENT_REQUESTS": 1,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
            # 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            # NOTE(review): 725 collides with SplashMiddleware above — scrapy
            # will raise on duplicate middleware order values; confirm the
            # intended priority for HttpAuthMiddleware.
            'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 725,
        },
    }

    # Per-spider configuration loaded from the project's constant module.
    constant = constant.constants_by_spider(name)
    http_user = constant['http_user']
    http_pass = constant['http_pass']
    proxy_ip = constant['proxy_ip']
    proxy_auth = constant['proxy_auth']
    gender_types = constant['gender_types']
    no_result_found = "nenhum resultado encontrado para consulta"
    availability = "continuar com compra"
    scrapy_details = constant['scrapy_details']
    allowed_domains = constant['allowed_domains']
    api_url_call = constant['api_url_call']
    count = constant['login_types']
    zipcodes_count = 0
    start_urls = []
    accounts = []

    # Lua script for the Splash 'execute' endpoint: load the page, wait 1s
    # for JS to run, return the rendered HTML.
    splash_source = """function main(splash)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(1.0))
        return splash:html()
    end"""

    def start_requests(self):
        """Fetch the crawl configuration from the API, then schedule the
        initial request that fans out into per-product requests."""
        request = urllib2.Request(self.api_url_call, headers=self.constant['headers'])
        retailer_product_configurations = json.load(urllib2.urlopen(request))
        self.scrapy_details = retailer_product_configurations
        login_type = self.scrapy_details["shopper_type"]
        is_register = "registered" in login_type
        self.accounts = self.scrapy_details["accounts"]
        retailer_product_configurations = self.scrapy_details['retailerproductconfigurationurls']
        anonymous_user_types = set(["anonymous", "both"]) & set(login_type)
        # import pudb; pudb.set_trace()
        # NOTE(review): the scraped condition was garbled ("... >= 0 ,
        # is_register == false"); the stripped word was most plausibly "and".
        # Also, "len(...) >= 0" is always true — "> 0" may have been intended.
        if (len(anonymous_user_types) >= 0 and not is_register) or (len(self.accounts) == 0):
            yield scrapy.Request(
                self.api_url_call,
                callback=self.main_url_parse,
                headers=self.constant['headers'],
                meta={
                    "shopper_type": "anonymous",
                    "retailer_product_configurations": retailer_product_configurations,
                    "index": 0,
                },
                errback=self.error,
            )

    def main_url_parse(self, response):
        """Schedule one Splash-rendered request per configured product URL."""
        product_store = []
        index = response.meta['index']
        retailer_product_configurations = response.meta['retailer_product_configurations']
        shopper_type = response.meta['shopper_type']
        self.request_count = len(retailer_product_configurations)
        # index = 0
        for retailer_product_configuration in retailer_product_configurations:
            url = retailer_product_configuration['url']
            page_type = retailer_product_configuration['urltype']
            retailer_id = retailer_product_configuration['retailerid']
            max_number_of_records = retailer_product_configuration['maxnumberofrecords']
            retailer_product_configuration_id = retailer_product_configuration['id']
            # NOTE(review): dead code as scraped — the string is never used
            # and the callback below is always self.parse.
            product_url = "self.color_url" if page_type == "pdp" else "self.parse"
            product_store.append(url)
            index += 1
            # Treat a missing or null 'colorvariant' flag as True.
            colorvariant = retailer_product_configuration.get('colorvariant', True)
            colorvariant = True if colorvariant is None else colorvariant
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta={
                    "product_store": product_store,
                    "start_url": url,
                    "shopper_type": shopper_type,
                    "retailer_id": retailer_id,
                    'id': retailer_product_configuration_id,
                    'max_number_of_records': max_number_of_records,
                    "index": index,
                    "colorvariant": colorvariant,
                    # Render through Splash with the Lua script above.
                    'splash': {
                        'args': {'lua_source': self.splash_source},
                        'endpoint': 'execute',
                    },
                },
                errback=self.error,
                dont_filter=True,
            )

    # error handling
    def error(self, response):
        """Errback for failed requests; intentionally a no-op."""
        pass

    def parse(self, response):
        """Extract product fields from a rendered PDP and yield one item."""
        retailer_id = response.meta['retailer_id']
        shopper_type = response.meta['shopper_type']
        name = response.xpath('//h1[contains(@class,"product-content-info-name product-info-js")]/text()').extract()
        img = response.xpath('//div[@class=\'product-primary-image\']//img/@src').extract()
        # sku_code = name
        item = DashcrawlerItem()
        item['name'] = name
        item['selling_price'] = response.xpath('//span[contains(@class,"product-content-info-offer-price offer-price offer-price-js product-price-amount-js")]/text()').extract()
        item['description'] = response.xpath('//div[contains(@class,"desc-container pdp-details-desc-container")]/text()').extract()
        item['retailer_product_configuration_id'] = response.meta['id']
        item['retailer_id'] = retailer_id
        # NOTE(review): the scraped format string "%m/%d/%y %h:%m:%s" is not
        # valid strftime (%h/%m/%s lower-cased by the scrape); restored to the
        # standard upper-case directives.
        item['crawl_date'] = datetime.datetime.utcnow().strftime("%m/%d/%Y %H:%M:%S")
        item['shopper_type'] = shopper_type
        item['product_url'] = response.url
        item['sku_code'] = name
        item['reference_code'] = "cent-%s" % (name)
        item['preview_image_url'] = img
        yield item
I am new to Python, Scrapy, Splash, and Lua as well.
So the spider gets the URLs from the API and performs the scraping. The above-mentioned Lucy store PDP is returned by the API.
Comments
Post a Comment