1
0
Fork 0
mirror of https://github.com/Findus23/nonsense.git synced 2024-09-19 16:03:50 +02:00

new crawler

This commit is contained in:
Lukas Winkler 2017-12-27 12:14:52 +01:00
parent bde2bed138
commit c06347fa65
13 changed files with 237 additions and 5005 deletions

2
.gitignore vendored
View file

@ -7,5 +7,3 @@ tmp
*.pickle *.pickle
__pycache__/ __pycache__/
config.py config.py
!words.yaml
!download.json

File diff suppressed because it is too large Load diff

0
fetch/fetch/__init__.py Normal file
View file

14
fetch/fetch/items.py Normal file
View file

@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class FetchItem(scrapy.Item):
    """Container for scraped data.

    Declare the fields this project scrapes as class attributes, e.g.::

        name = scrapy.Field()

    Currently empty: the spider yields plain dicts instead.
    """

View file

@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class FetchSpiderMiddleware(object):
    """Spider-middleware scaffold generated by ``scrapy startproject``.

    Every hook below is optional: any method left undefined is treated by
    Scrapy as a pass-through that does not modify the objects flowing by.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy's factory entry point: build the middleware and subscribe
        # to the spider_opened signal so we can log when crawling starts.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returning None lets processing continue; raising would abort it.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over whatever the spider yielded for this response; must
        # yield an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or an earlier middleware's
        # process_spider_input) raises. None means "not handled here";
        # otherwise return an iterable of Response, dict or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Like process_spider_output, but for the spider's start requests
        # (which doesn't have a response attached). Must yield only Requests.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

11
fetch/fetch/pipelines.py Normal file
View file

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FetchPipeline(object):
    """Item-pipeline placeholder that passes every item through unchanged.

    Activate it via the ITEM_PIPELINES setting before adding real logic.
    """

    def process_item(self, item, spider):
        # No transformation yet — return the item so later pipelines run.
        return item

90
fetch/fetch/settings.py Normal file
View file

@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# Scrapy settings for fetch project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Active settings for the "fetch" Scrapy project. Only commonly tweaked
# options appear here; the full reference lives at
# http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'fetch'

SPIDER_MODULES = ['fetch.spiders']
NEWSPIDER_MODULE = 'fetch.spiders'

# Identify the bot honestly via the default user agent; a browser-like
# agent is kept here for reference but disabled.
# USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'

# Respect robots.txt on the crawled sites.
ROBOTSTXT_OBEY = True

# Crawl politely: wait half a second between requests to the same site.
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
DOWNLOAD_DELAY = .5

# ---- Disabled defaults, kept for easy opt-in ----------------------------

# Concurrency (DOWNLOAD_DELAY honors only one of the PER_* settings):
#CONCURRENT_REQUESTS = 32
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies / telnet console (both enabled by default):
#COOKIES_ENABLED = False
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Middlewares, extensions and pipelines:
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'fetch.middlewares.FetchSpiderMiddleware': 543,
#}
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'fetch.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'fetch.pipelines.FetchPipeline': 300,
#}

# AutoThrottle — see http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default) — see
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View file

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View file

@ -0,0 +1,32 @@
import logging
from scrapy.spiders import Spider
logger = logging.getLogger(__name__)
class NonsenseSpider(Spider):
    """Crawl the IKEA Austria "all products" listings and yield one dict per
    product, carrying its ``name`` and ``description``.

    Output goes to ``../crawl.json`` via the JSON feed exporter configured
    in ``custom_settings``.
    """

    name = "nonsense"
    # Three entry pages that together link to every product-category page.
    start_urls = [
        "http://www.ikea.com/at/de/catalog/allproducts/",
        "http://www.ikea.com/at/de/catalog/allproducts/department/",
        "http://www.ikea.com/at/de/catalog/allproducts/alphabetical/"
    ]
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': "../crawl.json"
    }

    def parse(self, response):
        """Follow every category link found on an overview page."""
        for url in response.css(".productCategoryContainerWrapper a::attr(href)"):
            if url is not None:
                yield response.follow(url, callback=self.parse_product_list)
        # NOTE: dropped a leftover no-op `pass` that trailed this loop.

    def parse_product_list(self, response):
        """Yield a ``{'name', 'description'}`` dict for each product listed
        on a category page; missing selectors yield ``None`` values."""
        products = response.css(".productDetails")
        for product in products:
            yield {
                'name': product.css(".productTitle::text").extract_first(),
                'description': product.css(".productDesp::text").extract_first(),
            }

11
fetch/scrapy.cfg Normal file
View file

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = fetch.settings
[deploy]
#url = http://localhost:6800/
project = fetch

View file

@ -10,12 +10,12 @@ from PIL import Image
def gen(): def gen():
table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)] table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)]
# contents = open("ikeaname.txt").read().splitlines() with open('crawl.json') as inputfile:
with open('download.json') as inputfile: crawldata = json.load(inputfile)
contents = json.load(inputfile)["names"] names = {result["name"] for result in crawldata}
count = 0 count = 0
for name in contents: for name in names:
if name: if name is not None and "" not in name:
name = " " + name + " " name = " " + name + " "
zeichen = list(name) zeichen = list(name)
zeichenl = len(zeichen) zeichenl = len(zeichen)

View file

@ -1,37 +1,27 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import yaml import json
import re import re
import yaml
import requests with open('crawl.json', "r") as inputfile:
from bs4 import BeautifulSoup crawldata = json.load(inputfile)
descriptions = [] descriptions = {result["description"] for result in crawldata}
names = [] print(len(descriptions))
for i in range(0, 25):
r = requests.get("http://www.ikea.com/at/de/catalog/productsaz/{letter}/".format(letter=i))
soup = BeautifulSoup(r.text, 'html.parser')
for span in soup.find_all('span', "productsAzLink"):
product = span.a.string
m = re.match("((?:[^a-z\s]|Ä|Å|Ö){2,})? ?(.*)?", product)
print(product)
names.append(m.group(1))
descriptions.append(m.group(2))
data = {
"descriptions": list(set(descriptions)),
"names": list(set(names))
}
nouns = set() nouns = set()
adj = set() adj = set()
digit = set() digit = set()
prefix = set() prefix = set()
suffix = set() suffix = set()
for d in (data["descriptions"]): for d in descriptions:
nouns.update(re.findall("([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d)) if d is not None:
adj.update(re.findall(" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d)) nouns.update(re.findall("([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d))
digit.update(re.findall(" ([\d]+[\w.-]{3,}[\w./]+)", d)) adj.update(re.findall(" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d))
prefix.update(re.findall("([\w.-]+-)", d)) digit.update(re.findall(" ([\d]+[\w.-]{3,}[\w./]+)", d))
suffix.update(re.findall("(-[\w.-]+)", d)) prefix.update(re.findall("([\w.-]+-)", d))
suffix.update(re.findall("(-[\w.-]+)", d))
words = { words = {
"nouns": list(nouns), "nouns": list(nouns),

1424
words.yaml

File diff suppressed because it is too large Load diff