1
0
Fork 0
mirror of https://github.com/Findus23/nonsense.git synced 2024-09-19 16:03:50 +02:00
nonsense/prepare.py
2016-11-09 09:46:36 +01:00

44 lines
1.2 KiB
Python
Executable file

#!/usr/bin/env python3
import yaml
import re
import requests
from bs4 import BeautifulSoup
# Template for the A-Z product index pages; {letter} receives a numeric page index.
INDEX_URL = "http://www.ikea.com/at/de/catalog/productsaz/{letter}/"

# Number of index pages to fetch (indices 0 .. INDEX_PAGES - 1).
# NOTE(review): the original loop used range(0, 25), i.e. only 25 pages for a
# placeholder called "letter"; presumably there is one page per letter A-Z,
# so 26 are requested here -- confirm against the site.
INDEX_PAGES = 26

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Splits a link text into an optional leading run of at least two
# non-lowercase, non-whitespace characters (group 1, the product name) and
# the remaining text (group 2, the description).
PRODUCT_RE = re.compile(r"((?:[^a-z\s]|Ä|Å|Ö){2,})? ?(.*)?")

# Output key -> pattern applied to every description with findall().
# Raw strings keep the exact pattern text of the original literals without
# relying on invalid string escapes such as "\s" or "\d".
WORD_PATTERNS = (
    ("nouns", re.compile(r'([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()"]+)')),
    ("adj", re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")),
    ("digit", re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")),
    ("prefix", re.compile(r"([\w.-]+-)")),
    ("suffix", re.compile(r"(-[\w.-]+)")),
)


def split_product(product):
    """Split a product link text into ``(name, description)``.

    ``name`` is the leading run matched by group 1 of ``PRODUCT_RE`` or
    ``None`` when the text does not start with such a run; ``description``
    is the rest of the text (possibly the empty string).
    """
    # Every part of the pattern is optional, so it matches any string
    # (including "") and the match object is never None here.
    match = PRODUCT_RE.match(product)
    return match.group(1), match.group(2)


def scrape_products(pages=INDEX_PAGES):
    """Download the index pages and collect product names and descriptions.

    Requests ``pages`` index pages, reads the text of the ``<a>`` inside
    every ``<span class="productsAzLink">`` and splits it with
    ``split_product``. Each product text is printed as progress output.

    Returns a ``(names, descriptions)`` tuple of sets of strings.
    Network errors raised by ``requests`` propagate to the caller.
    """
    names = set()
    descriptions = set()
    for page in range(pages):
        url = INDEX_URL.format(letter=page)
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page contains no product links; skip it explicitly
            # instead of parsing it as if it were an index page.
            print("skipping {url}: HTTP {status}".format(
                url=url, status=response.status_code))
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        for span in soup.find_all('span', "productsAzLink"):
            link = span.a
            # .string is None when the tag has no single text child; passing
            # that to the regex would raise TypeError.
            product = link.string if link is not None else None
            if product is None:
                continue
            print(product)
            name, description = split_product(product)
            if name is not None:  # the name group is optional
                names.add(name)
            descriptions.add(description)
    return names, descriptions


def extract_words(descriptions):
    """Extract word fragments from an iterable of description strings.

    Applies every pattern in ``WORD_PATTERNS`` to every description and
    returns a dict mapping ``"nouns"``, ``"adj"``, ``"digit"``, ``"prefix"``
    and ``"suffix"`` to a sorted list of the unique matches. Sorting keeps
    the generated file stable between runs.
    """
    found = {key: set() for key, _ in WORD_PATTERNS}
    for description in descriptions:
        for key, pattern in WORD_PATTERNS:
            found[key].update(pattern.findall(description))
    return {key: sorted(values) for key, values in found.items()}


def main():
    """Scrape the product index and write the extracted words to words.yaml."""
    # The product names are collected but, as in the original script, only
    # the descriptions feed into the output file.
    _names, descriptions = scrape_products()
    words = extract_words(descriptions)
    with open('words.yaml', 'w', encoding='utf-8') as outfile:
        yaml.dump(words, outfile, default_flow_style=False)


if __name__ == "__main__":
    main()