mirror of
https://github.com/Findus23/nonsense.git
synced 2024-09-19 16:03:50 +02:00
update ikea crawler (old code)
This commit is contained in:
parent
480d74a318
commit
56a0f87e80
2 changed files with 18 additions and 19 deletions
|
@ -17,7 +17,7 @@ NEWSPIDER_MODULE = 'fetch.spiders'
|
|||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = 'fetch (+http://www.yourdomain.com)'
|
||||
# USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
|
||||
USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
|
@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
|
|||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
DOWNLOAD_DELAY = .5
|
||||
DOWNLOAD_DELAY = 2
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
@ -70,16 +70,16 @@ DOWNLOAD_DELAY = .5
|
|||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
AUTOTHROTTLE_DEBUG = True
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
|
@ -88,3 +88,6 @@ DOWNLOAD_DELAY = .5
|
|||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
|
||||
|
||||
RETRY_TIMES= 20
|
|
@ -8,9 +8,7 @@ logger = logging.getLogger(__name__)
|
|||
class NonsenseSpider(Spider):
|
||||
name = "nonsense"
|
||||
start_urls = [
|
||||
"http://www.ikea.com/at/de/catalog/allproducts/",
|
||||
"http://www.ikea.com/at/de/catalog/allproducts/department/",
|
||||
"http://www.ikea.com/at/de/catalog/allproducts/alphabetical/"
|
||||
"https://www.ikea.com/at/de/cat/produkte-functional/",
|
||||
]
|
||||
custom_settings = {
|
||||
'FEED_FORMAT': 'json',
|
||||
|
@ -18,15 +16,13 @@ class NonsenseSpider(Spider):
|
|||
}
|
||||
|
||||
def parse(self, response):
|
||||
for url in response.css(".productCategoryContainerWrapper a::attr(href)"):
|
||||
if url is not None:
|
||||
yield response.follow(url, callback=self.parse_product_list)
|
||||
pass
|
||||
|
||||
def parse_product_list(self, response):
|
||||
products = response.css(".productDetails")
|
||||
products = response.css(".product-compact")
|
||||
for product in products:
|
||||
description = product.css(".product-compact__type::text").extract_first().strip().strip(",")
|
||||
yield {
|
||||
'name': product.css(".productTitle::text").extract_first(),
|
||||
'description': product.css(".productDesp::text").extract_first(),
|
||||
'name': product.css(".product-compact__name::text").extract_first(),
|
||||
'description': description,
|
||||
}
|
||||
for url in response.css("a.catalog-list__link::attr(href)"):
|
||||
if url is not None:
|
||||
yield response.follow(url, callback=self.parse)
|
||||
|
|
Loading…
Reference in a new issue