# nonsense/prepare.py

#!/usr/bin/env python3

import re
from typing import Set

import tomli_w

import utils

# Load the crawled records through the project's `utils` helper.
crawldata = utils.crawl_data()

# Reduce the records to their distinct "description" values and report how
# many unique descriptions there are.
descriptions = set()
descriptions.update(entry["description"] for entry in crawldata)
print(len(descriptions))

# Recognises tokens that *begin* with a dimension/range fragment such as
# "10x20", "3-5", "12-" or "-7" (case-insensitive, so "10X20" counts too).
# It is used with .match(), so only the start of a token is inspected.
filter_regex = re.compile(r"(\d+[x-]\d+|\d+-|-\d+)", flags=re.IGNORECASE)


def postprocess(wordset: Set[str]) -> Set[str]:
    """Clean raw captures and return the surviving words as a new set.

    Each token is stripped of surrounding whitespace and of the noise
    fragments "+", "für " and "®". Tokens that end up empty, or that start
    with a numeric fragment recognised by ``filter_regex``, are dropped.
    A remaining token containing "/" is split on "/" (otherwise, one
    containing "," is split on ","), and every piece goes through the same
    clean-up again. The input set is not modified.
    """
    cleaned_words = set()
    # Work list instead of recursion: pieces produced by splitting are simply
    # queued and cleaned like any other token.
    pending = list(wordset)
    while pending:
        token = pending.pop().strip()
        for noise in ("+", "für ", "®"):
            token = token.replace(noise, "").strip()

        # Every "+" has been removed above, so an emptiness test is enough.
        if not token or filter_regex.match(token):
            continue

        # "/" takes precedence over ","; a comma inside a "/"-separated piece
        # is handled when that piece comes back off the work list.
        for separator in ("/", ","):
            if separator in token:
                pending.extend(token.split(separator))
                break
        else:
            cleaned_words.add(token)
    return cleaned_words


# One bucket per word category; each collects the raw regex captures from
# every crawled description.
nouns = set()
adj = set()
digit = set()
prefix = set()
suffix = set()

# Each bucket paired with the pattern that feeds it.
extractors = (
    # An uppercase letter (A-Z, Ö, Ä, Ü) followed by characters that are not
    # uppercase, whitespace, digits or one of - / , + ( ) ".
    (nouns, re.compile(r"([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)")),
    # After a space: at least three non-uppercase, non-digit characters,
    # ending in one that is also not whitespace.
    (adj, re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")),
    # After a space: digits followed by at least four further characters
    # (word characters, ".", "-", and "/" in the trailing part).
    (digit, re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")),
    # A run of word characters, "." or "-" that ends in a hyphen.
    (prefix, re.compile(r"([\w.-]+-)")),
    # A hyphen followed by a run of word characters, "." or "-".
    (suffix, re.compile(r"(-[\w.-]+)")),
)
for d in descriptions:
    if d is None:
        # Records without a description contribute nothing.
        continue
    for bucket, pattern in extractors:
        bucket.update(pattern.findall(d))

# Clean every bucket and sort it; the key order here is the table order in
# the generated TOML file.
words = {
    category: sorted(postprocess(raw))
    for category, raw in (
        ("nouns", nouns),
        ("adj", adj),
        ("digit", digit),
        ("prefix", prefix),
        ("suffix", suffix),
    )
}
# tomli_w.dump writes bytes, hence the binary file mode.
with open("words.toml", "wb") as outfile:
    tomli_w.dump(words, outfile)