Mirror of https://github.com/Findus23/nonsense.git

update ikea crawler (old code)

Lukas Winkler 2019-12-20 18:19:44 +01:00
parent 480d74a318
commit 56a0f87e80
Signed by: lukas (GPG key ID: 54DE4D798D244853)
2 changed files with 18 additions and 19 deletions

@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'fetch.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'fetch (+http://www.yourdomain.com)'
-# USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
+USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = .5
+DOWNLOAD_DELAY = 2
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -70,16 +70,16 @@ DOWNLOAD_DELAY = .5
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_DEBUG = True
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
@@ -88,3 +88,6 @@ DOWNLOAD_DELAY = .5
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+RETRY_TIMES= 20
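Taken together, the settings changes amount to the following throttling/politeness block. This is a consolidated sketch of the values as they stand after this commit; the file path (fetch/settings.py, inferred from NEWSPIDER_MODULE) is an assumption, and only lines touched or shown by the diff are included.

# fetch/settings.py (excerpt) -- consolidated sketch of the values after this commit;
# the path and line grouping are assumptions, the values come from the diff above.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36')

ROBOTSTXT_OBEY = True   # unchanged: keep honoring robots.txt
DOWNLOAD_DELAY = 2      # raised from 0.5s to 2s between requests to the same site

# AutoThrottle adjusts the effective delay dynamically based on server latency
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = True   # log throttling stats for every response

RETRY_TIMES = 20            # new: retry failed requests up to 20 times

The combination of a higher fixed DOWNLOAD_DELAY, AutoThrottle, and a generous RETRY_TIMES trades crawl speed for politeness and resilience against transient errors.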

@@ -8,9 +8,7 @@ logger = logging.getLogger(__name__)
 class NonsenseSpider(Spider):
     name = "nonsense"
     start_urls = [
-        "http://www.ikea.com/at/de/catalog/allproducts/",
-        "http://www.ikea.com/at/de/catalog/allproducts/department/",
-        "http://www.ikea.com/at/de/catalog/allproducts/alphabetical/"
+        "https://www.ikea.com/at/de/cat/produkte-functional/",
     ]
     custom_settings = {
         'FEED_FORMAT': 'json',
@@ -18,15 +16,13 @@ class NonsenseSpider(Spider):
     }
     def parse(self, response):
-        for url in response.css(".productCategoryContainerWrapper a::attr(href)"):
-            if url is not None:
-                yield response.follow(url, callback=self.parse_product_list)
+        pass
     def parse_product_list(self, response):
-        products = response.css(".productDetails")
+        products = response.css(".product-compact")
         for product in products:
+            description = product.css(".product-compact__type::text").extract_first().strip().strip(",")
             yield {
-                'name': product.css(".productTitle::text").extract_first(),
-                'description': product.css(".productDesp::text").extract_first(),
+                'name': product.css(".product-compact__name::text").extract_first(),
+                'description': description,
             }
-        for url in response.css("a.catalog-list__link::attr(href)"):
-            if url is not None:
-                yield response.follow(url, callback=self.parse)
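For orientation, the spider file after this commit plausibly reads roughly as follows. This is a sketch assembled from the diff's context and added lines; the imports, the remainder of custom_settings, and anything else outside the shown hunks are assumptions.

# Sketch of NonsenseSpider after this commit, reconstructed from the diff above.
# Imports and everything not visible in the hunks are assumptions.
import logging

from scrapy import Spider

logger = logging.getLogger(__name__)


class NonsenseSpider(Spider):
    name = "nonsense"
    start_urls = [
        # new IKEA catalogue entry point (replaces the old /catalog/allproducts/ pages)
        "https://www.ikea.com/at/de/cat/produkte-functional/",
    ]
    custom_settings = {
        'FEED_FORMAT': 'json',
        # further feed settings are cut off in the diff and therefore omitted here
    }

    def parse(self, response):
        # after this commit the default callback is a stub; the old code followed
        # ".productCategoryContainerWrapper" category links from here
        pass

    def parse_product_list(self, response):
        # product tiles on the redesigned IKEA listing pages
        products = response.css(".product-compact")
        for product in products:
            description = product.css(".product-compact__type::text").extract_first().strip().strip(",")
            yield {
                'name': product.css(".product-compact__name::text").extract_first(),
                'description': description,
            }

With the project's Scrapy setup, the spider would be run as usual via scrapy crawl nonsense, writing the scraped names and descriptions to the configured JSON feed.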