nonsense/prepare.py

#!/usr/bin/env python3

import re
from typing import Set

import tomli_w

import utils

# Fetch the crawl results through the project's utils helper.
crawldata = utils.crawl_data()

# A set keeps each distinct description value only once; the count is
# printed as a quick progress/sanity figure.
descriptions = set(entry["description"] for entry in crawldata)
print(len(descriptions))

# Entries that *start* with one of these shapes are discarded by postprocess():
# digits joined by "x" or "-" to more digits, digits followed by "-", or "-"
# followed by digits.
filter_regex = re.compile(r"(\d+[x-]\d+|\d+-|-\d+)", flags=re.IGNORECASE)


def postprocess(wordset: Set[str]):
    """Clean a collection of raw matches and return the surviving words.

    Every entry is stripped of surrounding whitespace and has the fragments
    "+", "für " and "®" removed.  An entry is then dropped when it is empty
    or when ``filter_regex`` matches at its start.  An entry containing "/"
    is split on "/" (otherwise, one containing "," is split on ","), and
    each piece goes through the same cleaning again.

    Args:
        wordset: The strings to clean.

    Returns:
        A new set holding the cleaned words.
    """
    cleaned = set()
    # Explicit work list: pieces produced by splitting are queued here and
    # cleaned exactly like the original entries.
    pending = list(wordset)
    while pending:
        token = pending.pop().strip()
        for fragment in ("+", "für ", "®"):
            token = token.replace(fragment, "").strip()

        if token in ("", "+") or filter_regex.match(token):
            continue

        # "/" takes precedence over ","; a piece that still contains the
        # other separator is split when it is taken from the work list.
        for separator in ("/", ","):
            if separator in token:
                pending.extend(token.split(separator))
                break
        else:
            cleaned.add(token)
    return cleaned


# One result set per word category harvested from the descriptions.
nouns, adj, digit, prefix, suffix = set(), set(), set(), set(), set()

# Each category set is paired with the pattern whose matches it collects.
extractors = (
    (nouns, re.compile(r"([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)")),
    (adj, re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")),
    (digit, re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")),
    (prefix, re.compile(r"([\w.-]+-)")),
    (suffix, re.compile(r"(-[\w.-]+)")),
)

for description in descriptions:
    # Descriptions may be None; those carry no text to scan.
    if description is None:
        continue
    for bucket, pattern in extractors:
        bucket.update(pattern.findall(description))

# Clean every category with postprocess() and store it as a sorted list.
words = {
    label: sorted(postprocess(bucket))
    for label, bucket in (
        ("nouns", nouns),
        ("adj", adj),
        ("digit", digit),
        ("prefix", prefix),
        ("suffix", suffix),
    )
}

# The file is opened in binary mode for tomli_w.dump.
with open("words.toml", "wb") as toml_file:
    tomli_w.dump(words, toml_file)
Neue Version fertig 2016-07-19 15:53:11 +02:00			`#!/usr/bin/env python3`
new crawler 2017-12-27 12:14:52 +01:00
Neu in Python 2016-07-19 14:53:49 +02:00			`import re`
update to latest website 2021-05-12 18:41:01 +02:00			`from typing import Set`

minor updates 2023-02-04 21:45:50 +01:00			`import tomli_w`
Neu in Python 2016-07-19 14:53:49 +02:00
add data of all crawls together 2018-01-20 21:26:30 +01:00			`import utils`

			`crawldata = utils.crawl_data()`
Neu in Python 2016-07-19 14:53:49 +02:00
new crawler 2017-12-27 12:14:52 +01:00			`descriptions = {result["description"] for result in crawldata}`
			`print(len(descriptions))`
API 2016-11-08 15:00:15 +01:00
update to latest website 2021-05-12 18:41:01 +02:00			`filter_regex = re.compile(r"(\d+[x-]\d+\|\d+-\|-\d+)", flags=re.IGNORECASE)`


			`def postprocess(wordset: Set[str]):`
			`new_words = set()`
			`for word in wordset:`
			`word = word.strip()`
			`for replacephrase in ["+", "für ", "®"]:`
			`word = word.replace(replacephrase, "").strip()`
			`if filter_regex.match(word):`
			`continue`
			`if word in ["", "+"]:`
			`continue`
			`if "/" in word:`
			`new_words.update(postprocess(set(word.split("/"))))`
			`elif "," in word:`
			`new_words.update(postprocess(set(word.split(","))))`
			`else:`
			`new_words.add(word)`
			`return new_words`


API 2016-11-08 15:00:15 +01:00			`nouns = set()`
			`adj = set()`
			`digit = set()`
			`prefix = set()`
			`suffix = set()`
new crawler 2017-12-27 12:14:52 +01:00			`for d in descriptions:`
			`if d is not None:`
update to latest website 2021-05-12 18:41:01 +02:00			`nouns.update(re.findall(r"([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d))`
			`adj.update(re.findall(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d))`
			`digit.update(re.findall(r" ([\d]+[\w.-]{3,}[\w./]+)", d))`
			`prefix.update(re.findall(r"([\w.-]+-)", d))`
			`suffix.update(re.findall(r"(-[\w.-]+)", d))`
API 2016-11-08 15:00:15 +01:00
			`words = {`
update to latest website 2021-05-12 18:41:01 +02:00			`"nouns": sorted(postprocess(nouns)),`
			`"adj": sorted(postprocess(adj)),`
			`"digit": sorted(postprocess(digit)),`
			`"prefix": sorted(postprocess(prefix)),`
			`"suffix": sorted(postprocess(suffix))`
API 2016-11-08 15:00:15 +01:00			`}`
minor updates 2023-02-04 21:45:50 +01:00			`with open('words.toml', 'wb') as outfile:`
			`tomli_w.dump(words, outfile)`