#!/usr/bin/env python3
"""Scrape the ikea.com /at/de/ product A-Z index and extract word lists.

Each index page is downloaded, the text of every ``productsAzLink`` entry is
split into a leading upper-case product name and a trailing description, and
the descriptions are mined with a handful of regular expressions.  The
resulting word lists are written to ``words.yaml``.
"""
import re
import sys

import requests
import yaml
from bs4 import BeautifulSoup

# URL template of one index page; ``letter`` is filled with the page number.
INDEX_URL = "http://www.ikea.com/at/de/catalog/productsaz/{letter}/"

# Pages 0..24 are fetched.
# NOTE(review): if the site exposes one page per letter A-Z (0..25), the last
# page is never fetched -- confirm against the live site before changing this.
PAGE_COUNT = 25

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

OUTPUT_PATH = "words.yaml"

# Group 1: optional run of two or more non-lowercase, non-space characters
# (the product name).  Group 2: whatever follows (the description).
PRODUCT_RE = re.compile(r"((?:[^a-z\s]|Ä|Å|Ö){2,})? ?(.*)?")

# One pattern per output key; each has exactly one capturing group, so
# ``findall`` yields plain strings.
WORD_PATTERNS = {
    "nouns": re.compile(r'([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()"]+)'),
    "adj": re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])"),
    "digit": re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)"),
    "prefix": re.compile(r"([\w.-]+-)"),
    "suffix": re.compile(r"(-[\w.-]+)"),
}


def split_product(product):
    """Split a product link text into ``(name, description)``.

    ``name`` is ``None`` when the text does not start with an upper-case
    product name; ``description`` is always a string (possibly empty).
    """
    # Every part of PRODUCT_RE is optional, so match() cannot return None.
    match = PRODUCT_RE.match(product)
    return match.group(1), match.group(2)


def fetch_descriptions():
    """Download all index pages and return the set of product descriptions.

    Pages that answer with an HTTP error status are skipped with a warning on
    stderr.  Connection errors and timeouts propagate as
    ``requests.RequestException``.
    """
    descriptions = set()
    for page in range(PAGE_COUNT):
        response = requests.get(
            INDEX_URL.format(letter=page), timeout=REQUEST_TIMEOUT
        )
        if not response.ok:
            # An error page contains no product links; say so instead of
            # silently parsing it.
            print(
                "skipping page {}: HTTP {}".format(page, response.status_code),
                file=sys.stderr,
            )
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for span in soup.find_all("span", class_="productsAzLink"):
            link = span.a
            # ``.string`` is None when the link has no single text child.
            product = link.string if link is not None else None
            if product is None:
                continue
            print(product)
            descriptions.add(split_product(product)[1])
    return descriptions


def extract_words(descriptions):
    """Apply every pattern in ``WORD_PATTERNS`` to each description.

    Returns a dict mapping each pattern key to a sorted list of the distinct
    matches, so that the generated file is stable between runs.
    """
    found = {key: set() for key in WORD_PATTERNS}
    for description in descriptions:
        for key, pattern in WORD_PATTERNS.items():
            found[key].update(pattern.findall(description))
    return {key: sorted(values) for key, values in found.items()}


def main():
    """Scrape the index and write the extracted word lists to ``words.yaml``."""
    words = extract_words(fetch_descriptions())
    with open(OUTPUT_PATH, "w", encoding="utf-8") as outfile:
        yaml.dump(words, outfile, default_flow_style=False)


if __name__ == "__main__":
    main()