1
0
Fork 0
mirror of https://github.com/Findus23/nonsense.git synced 2024-09-19 16:03:50 +02:00

new crawler

This commit is contained in:
Lukas Winkler 2017-12-27 12:14:52 +01:00
parent bde2bed138
commit c06347fa65
13 changed files with 237 additions and 5005 deletions

2
.gitignore vendored
View file

@ -7,5 +7,3 @@ tmp
*.pickle *.pickle
__pycache__/ __pycache__/
config.py config.py
!words.yaml
!download.json

File diff suppressed because it is too large Load diff

0
fetch/fetch/__init__.py Normal file
View file

14
fetch/fetch/items.py Normal file
View file

@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class FetchItem(scrapy.Item):
    """Container for scraped data.

    Declare the fields this project scrapes as class attributes, e.g.::

        name = scrapy.Field()

    Currently empty: the spider yields plain dicts instead.
    """

View file

@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class FetchSpiderMiddleware(object):
    """Spider-middleware scaffold generated by ``scrapy startproject``.

    Every hook below is optional: any method left undefined is treated by
    Scrapy as a pass-through that does not modify the objects flowing by.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy's factory entry point: build the middleware and subscribe
        # to the spider_opened signal so we can log when crawling starts.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returning None lets processing continue; raising would abort it.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over whatever the spider yielded for this response; must
        # yield an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or an earlier middleware's
        # process_spider_input) raises. None means "not handled here";
        # otherwise return an iterable of Response, dict or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the spider's start requests
        # (which doesn't have a response attached). Must yield only Requests.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

11
fetch/fetch/pipelines.py Normal file
View file

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FetchPipeline(object):
    """Item-pipeline placeholder that passes every item through unchanged.

    Activate it via the ITEM_PIPELINES setting before adding real logic.
    """

    def process_item(self, item, spider):
        # No transformation yet — return the item so later pipelines run.
        return item

90
fetch/fetch/settings.py Normal file
View file

@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for fetch project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Active settings for the "fetch" Scrapy project. Only commonly tweaked
# options appear here; the full reference lives at
# http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'fetch'

SPIDER_MODULES = ['fetch.spiders']
NEWSPIDER_MODULE = 'fetch.spiders'

# Identify the bot honestly via the default user agent; a browser-like
# agent is kept here for reference but disabled.
# USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'

# Respect robots.txt on the crawled sites.
ROBOTSTXT_OBEY = True

# Crawl politely: wait half a second between requests to the same site.
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
DOWNLOAD_DELAY = .5

# ---- Disabled defaults, kept for easy opt-in ----------------------------

# Concurrency (DOWNLOAD_DELAY honors only one of the PER_* settings):
#CONCURRENT_REQUESTS = 32
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies / telnet console (both enabled by default):
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Middlewares, extensions and pipelines:
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fetch.middlewares.FetchSpiderMiddleware': 543,
#}
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'fetch.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'fetch.pipelines.FetchPipeline': 300,
#}

# AutoThrottle — see http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default) — see
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View file

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View file

@ -0,0 +1,32 @@
import logging
from scrapy.spiders import Spider
logger = logging.getLogger(__name__)
class NonsenseSpider(Spider):
    """Crawl the IKEA Austria "all products" listings and yield one dict per
    product, carrying its ``name`` and ``description``.

    Output goes to ``../crawl.json`` via the JSON feed exporter configured
    in ``custom_settings``.
    """

    name = "nonsense"
    # Three entry pages that together link to every product-category page.
    start_urls = [
        "http://www.ikea.com/at/de/catalog/allproducts/",
        "http://www.ikea.com/at/de/catalog/allproducts/department/",
        "http://www.ikea.com/at/de/catalog/allproducts/alphabetical/"
    ]
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': "../crawl.json"
    }

    def parse(self, response):
        """Follow every category link found on an overview page."""
        for url in response.css(".productCategoryContainerWrapper a::attr(href)"):
            if url is not None:
                yield response.follow(url, callback=self.parse_product_list)
        # NOTE: dropped a leftover no-op `pass` that trailed this loop.

    def parse_product_list(self, response):
        """Yield a ``{'name', 'description'}`` dict for each product listed
        on a category page; missing selectors yield ``None`` values."""
        products = response.css(".productDetails")
        for product in products:
            yield {
                'name': product.css(".productTitle::text").extract_first(),
                'description': product.css(".productDesp::text").extract_first(),
            }

11
fetch/scrapy.cfg Normal file
View file

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = fetch.settings
[deploy]
#url = http://localhost:6800/
project = fetch

View file

@ -10,12 +10,12 @@ from PIL import Image
def gen(): def gen():
table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)] table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)]
# contents = open("ikeaname.txt").read().splitlines() with open('crawl.json') as inputfile:
with open('download.json') as inputfile: crawldata = json.load(inputfile)
contents = json.load(inputfile)["names"] names = {result["name"] for result in crawldata}
count = 0 count = 0
for name in contents: for name in names:
if name: if name is not None and "" not in name:
name = " " + name + " " name = " " + name + " "
zeichen = list(name) zeichen = list(name)
zeichenl = len(zeichen) zeichenl = len(zeichen)

View file

@ -1,37 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import yaml import json
import re import re
import yaml
import requests with open('crawl.json', "r") as inputfile:
from bs4 import BeautifulSoup crawldata = json.load(inputfile)
descriptions = [] descriptions = {result["description"] for result in crawldata}
names = [] print(len(descriptions))
for i in range(0, 25):
r = requests.get("http://www.ikea.com/at/de/catalog/productsaz/{letter}/".format(letter=i))
soup = BeautifulSoup(r.text, 'html.parser')
for span in soup.find_all('span', "productsAzLink"):
product = span.a.string
m = re.match("((?:[^a-z\s]|Ä|Å|Ö){2,})? ?(.*)?", product)
print(product)
names.append(m.group(1))
descriptions.append(m.group(2))
data = {
"descriptions": list(set(descriptions)),
"names": list(set(names))
}
nouns = set() nouns = set()
adj = set() adj = set()
digit = set() digit = set()
prefix = set() prefix = set()
suffix = set() suffix = set()
for d in (data["descriptions"]): for d in descriptions:
nouns.update(re.findall("([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d)) if d is not None:
adj.update(re.findall(" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d)) nouns.update(re.findall("([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d))
digit.update(re.findall(" ([\d]+[\w.-]{3,}[\w./]+)", d)) adj.update(re.findall(" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d))
prefix.update(re.findall("([\w.-]+-)", d)) digit.update(re.findall(" ([\d]+[\w.-]{3,}[\w./]+)", d))
suffix.update(re.findall("(-[\w.-]+)", d)) prefix.update(re.findall("([\w.-]+-)", d))
suffix.update(re.findall("(-[\w.-]+)", d))
words = { words = {
"nouns": list(nouns), "nouns": list(nouns),

1424
words.yaml

File diff suppressed because it is too large Load diff