python 2.7 - Unable to get the text from a span using scrapy-splash -


I am trying out a few examples to scrape the following fields: name, description, price, and image, from this page:

http://www.lucy.com/shop/shop-by-activity-running/extra-mile-top-0a2s96?variationid=005

I am facing two problems here:

a) I am able to get the name and description correctly, but the price is not being scraped correctly. The value I receive for the price is the following, instead of the expected one shown below it:

'selling_price': [u'\xa0'],

'selling_price': [u'$35.00'],

b) I don't know how to extract the image, since a canvas element is what shows the image when I inspect it in Firebug. Maybe I need to use JavaScript, which I am struggling with.

The following is the spider I have created:

# -*- coding: utf-8 -*- import scrapy import urllib2 import datetime import simplejson json scrapy.spiders import crawlspider dashcrawler.items import dashcrawleritem dashcrawler import constant  class lucyspider(crawlspider):     name = "lucy"     custom_settings = {         "dupefilter_class": 'dashcrawler.middlewares.customfilter',         "downloader_clientcontextfactory":                                             'dashcrawler.context.customcontextfactory',         # "user_agent": "mozilla/5.0 (x11; linux x86_64) applewebkit/537.36 (khtml, gecko) chrome/49.0.2623.75 safari/537.36",     "retry_times": 4,     "retry_http_codes": [504, 400],     "concurrent_requests": 1,     'downloader_middlewares': {         'scrapy.downloadermiddlewares.useragent.useragentmiddleware': none,         'scrapy.downloadermiddlewares.httpproxy.httpproxymiddleware': 110,         # 'scrapy_fake_useragent.middleware.randomuseragentmiddleware': 400,         'scrapy_splash.splashcookiesmiddleware': 723,         'scrapy_splash.splashmiddleware': 725,         'scrapy.downloadermiddlewares.httpcompression.httpcompressionmiddleware': 810,         'scrapy.downloadermiddlewares.httpauth.httpauthmiddleware': 725,     } } constant = constant.constants_by_spider(name) http_user = constant['http_user'] http_pass = constant['http_pass'] proxy_ip = constant['proxy_ip'] proxy_auth = constant['proxy_auth'] gender_types = constant['gender_types'] no_result_found = "nenhum resultado encontrado para consulta" availability = "continuar com compra" scrapy_details = constant['scrapy_details'] allowed_domains = constant['allowed_domains'] api_url_call = constant['api_url_call'] count = constant['login_types'] zipcodes_count = 0 start_urls = []   accounts = []  splash_source = """function main(splash)                         local url = splash.args.url                         assert(splash:go(url))                         assert(splash:wait(1.0))                         return splash:html()                   
      end"""  def start_requests(self):     request = urllib2.request(self.api_url_call, headers=self.constant['headers'])     retailer_product_configurations = json.load(urllib2.urlopen(request))     self.scrapy_details = retailer_product_configurations     login_type = self.scrapy_details["shopper_type"]     is_register = "registered" in login_type     self.accounts = self.scrapy_details["accounts"]     retailer_product_configurations = self.scrapy_details['retailerproductconfigurationurls']     anonymous_user_types = set(["anonymous", "both"]) & set(login_type)     #import pudb; pudb.set_trace()     if (len(anonymous_user_types) >= 0 , is_register == false) or (len(self.accounts) == 0):         yield scrapy.request(             self.api_url_call,             callback=self.main_url_parse,             headers=self.constant['headers'],             meta={                 "shopper_type": "anonymous",                 "retailer_product_configurations": retailer_product_configurations,                 "index": 0,             },             errback=self.error         )  def main_url_parse(self, response):     product_store = []     index = response.meta['index']     retailer_product_configurations = response.meta['retailer_product_configurations']     shopper_type = response.meta['shopper_type']     self.request_count = len(retailer_product_configurations)     # index = 0     retailer_product_configuration in retailer_product_configurations:         url = retailer_product_configuration['url']         page_type = retailer_product_configuration['urltype']         retailer_id = retailer_product_configuration['retailerid']         max_number_of_records = retailer_product_configuration['maxnumberofrecords']         retailer_product_configuration_id = retailer_product_configuration['id']         product_url = "self.color_url" if page_type == "pdp" else "self.parse"         product_store.append(url)         index += 1         colorvariant = 
retailer_product_configuration.get('colorvariant', true)         colorvariant = true if colorvariant == none else colorvariant         yield scrapy.request(             url,             callback=  self.parse,             meta={"product_store": product_store,                   "start_url": url,                   "shopper_type": shopper_type,                   "retailer_id": retailer_id,                   'id': retailer_product_configuration_id,                   'max_number_of_records': max_number_of_records,                   "index": index,                   "colorvariant": colorvariant,                   'splash':{                               'args': {                                 'lua_source': self.splash_source},                                 'endpoint': 'execute',                             }                   },             errback=self.error,             dont_filter=true,         )  # error handling def error(self, response):     pass   def parse(self, response):         retailer_id = response.meta['retailer_id']         shopper_type = response.meta['shopper_type']         name = response.xpath('//h1[contains(@class,"product-content-info-name product-info-js")]/text()').extract()         img = response.xpath('//div[@class=\'product-primary-image\']//img/@src').extract()        # sku_code = name         item = dashcrawleritem()         item['name'] = response.xpath('//h1[contains(@class,"product-content-info-name product-info-js")]/text()').extract()         item['selling_price'] = response.xpath('//span[contains(@class,"product-content-info-offer-price offer-price offer-price-js product-price-amount-js")]/text()').extract()         item['description'] = response.xpath('//div[contains(@class,"desc-container pdp-details-desc-container")]/text()').extract()         item['retailer_product_configuration_id'] = response.meta['id']         item['retailer_id'] = retailer_id         item['crawl_date'] = datetime.datetime.utcnow().strftime("%m/%d/%y %h:%m:%s") 
        item['shopper_type'] = shopper_type         item['product_url'] = response.url         item['sku_code'] = name         item['reference_code'] = "cent-%s" % (name)         item['preview_image_url'] = img         yield item 

I am new to Python, Scrapy, Splash, and Lua as well.

So the spider gets the URLs to be crawled from an API and performs the scraping. The above-mentioned Lucy store PDP URL is one of those returned by the API.


Comments

Popular posts from this blog

asynchronous - C# WinSCP .NET assembly: How to upload multiple files asynchronously -

aws api gateway - SerializationException in posting new Records via Dynamodb Proxy Service in API -

asp.net - Problems sending emails from forum -