Mirror of https://github.com/Findus23/nonsense.git

update ikea crawler (old code)

Lukas Winkler 2019-12-20 18:19:44 +01:00
parent 480d74a318
commit 56a0f87e80
Signed by: lukas (GPG key ID: 54DE4D798D244853)
2 changed files with 18 additions and 19 deletions

@@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'fetch.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'fetch (+http://www.yourdomain.com)'
-# USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
+USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = .5
+DOWNLOAD_DELAY = 2
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -70,16 +70,16 @@ DOWNLOAD_DELAY = .5
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_DEBUG = True
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
@@ -88,3 +88,6 @@ DOWNLOAD_DELAY = .5
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+RETRY_TIMES= 20
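Taken together, the settings changes amount to the following throttling/politeness block. This is a consolidated sketch of the values as they stand after this commit; the file path (fetch/settings.py, inferred from NEWSPIDER_MODULE) is an assumption, and only lines touched or shown by the diff are included.

# fetch/settings.py (excerpt) -- consolidated sketch of the values after this commit;
# the path and line grouping are assumptions, the values come from the diff above.
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36')

ROBOTSTXT_OBEY = True   # unchanged: keep honoring robots.txt
DOWNLOAD_DELAY = 2      # raised from 0.5s to 2s between requests to the same site

# AutoThrottle adjusts the effective delay dynamically based on server latency
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = True   # log throttling stats for every response

RETRY_TIMES = 20            # new: retry failed requests up to 20 times

The combination of a higher fixed DOWNLOAD_DELAY, AutoThrottle, and a generous RETRY_TIMES trades crawl speed for politeness and resilience against transient errors.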

@@ -8,9 +8,7 @@ logger = logging.getLogger(__name__)
 class NonsenseSpider(Spider):
     name = "nonsense"
     start_urls = [
-        "http://www.ikea.com/at/de/catalog/allproducts/",
-        "http://www.ikea.com/at/de/catalog/allproducts/department/",
-        "http://www.ikea.com/at/de/catalog/allproducts/alphabetical/"
+        "https://www.ikea.com/at/de/cat/produkte-functional/",
     ]
     custom_settings = {
         'FEED_FORMAT': 'json',
@@ -18,15 +16,13 @@ class NonsenseSpider(Spider):
     }
     def parse(self, response):
-        for url in response.css(".productCategoryContainerWrapper a::attr(href)"):
-            if url is not None:
-                yield response.follow(url, callback=self.parse_product_list)
+        pass
     def parse_product_list(self, response):
-        products = response.css(".productDetails")
+        products = response.css(".product-compact")
         for product in products:
+            description = product.css(".product-compact__type::text").extract_first().strip().strip(",")
             yield {
-                'name': product.css(".productTitle::text").extract_first(),
-                'description': product.css(".productDesp::text").extract_first(),
+                'name': product.css(".product-compact__name::text").extract_first(),
+                'description': description,
             }
-        for url in response.css("a.catalog-list__link::attr(href)"):
-            if url is not None:
-                yield response.follow(url, callback=self.parse)
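For orientation, the spider file after this commit plausibly reads roughly as follows. This is a sketch assembled from the diff's context and added lines; the imports, the remainder of custom_settings, and anything else outside the shown hunks are assumptions.

# Sketch of NonsenseSpider after this commit, reconstructed from the diff above.
# Imports and everything not visible in the hunks are assumptions.
import logging

from scrapy import Spider

logger = logging.getLogger(__name__)


class NonsenseSpider(Spider):
    name = "nonsense"
    start_urls = [
        # new IKEA catalogue entry point (replaces the old /catalog/allproducts/ pages)
        "https://www.ikea.com/at/de/cat/produkte-functional/",
    ]
    custom_settings = {
        'FEED_FORMAT': 'json',
        # further feed settings are cut off in the diff and therefore omitted here
    }

    def parse(self, response):
        # after this commit the default callback is a stub; the old code followed
        # ".productCategoryContainerWrapper" category links from here
        pass

    def parse_product_list(self, response):
        # product tiles on the redesigned IKEA listing pages
        products = response.css(".product-compact")
        for product in products:
            description = product.css(".product-compact__type::text").extract_first().strip().strip(",")
            yield {
                'name': product.css(".product-compact__name::text").extract_first(),
                'description': description,
            }

With the project's Scrapy setup, the spider would be run as usual via scrapy crawl nonsense, writing the scraped names and descriptions to the configured JSON feed.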