mirror of
https://github.com/Findus23/nonsense.git
synced 2024-09-09 04:23:50 +02:00
58 lines
1.6 KiB
Python
Executable file
58 lines
1.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
import re
|
|
from typing import Set
|
|
|
|
import tomli_w
|
|
|
|
import utils
|
|
|
|
# Load the crawled records from the project-local helper.
# NOTE(review): the exact shape of the records is not visible here; the code
# below only relies on each one supporting result["description"].
crawldata = utils.crawl_data()

# Collapse the records into the set of distinct description values and
# report how many there are.
descriptions = {record["description"] for record in crawldata}
print(len(descriptions))

# Applied with .match() in postprocess(), so it only tests the START of a
# word: "<digits>x<digits>", "<digits>-<digits>", "<digits>-" or "-<digits>".
# IGNORECASE only matters for the literal "x" (so "X" is accepted as well).
filter_regex = re.compile(
    r"(\d+[x-]\d+|\d+-|-\d+)",
    re.IGNORECASE,
)
|
|
|
|
|
|
def postprocess(wordset: Set[str]):
    """Clean a set of raw regex matches and return the surviving words.

    Every entry is stripped of surrounding whitespace and has the phrases
    "+", "für " and "®" removed. Entries that end up empty, or whose start
    matches the module-level ``filter_regex``, are dropped. An entry that
    contains "/" is split on "/" and its parts are cleaned recursively;
    otherwise an entry containing "," is handled the same way with ",".
    Anything else is kept as-is.

    Returns a new set; ``wordset`` itself is not modified.
    """
    cleaned = set()
    for raw in wordset:
        candidate = raw.strip()
        # Strip after every removal so whitespace exposed by dropping a
        # phrase does not survive into the next step.
        for phrase in ("+", "für ", "®"):
            candidate = candidate.replace(phrase, "").strip()

        # Discard number-like fragments and entries with nothing left.
        if filter_regex.match(candidate) or candidate in ("", "+"):
            continue

        # "/" takes precedence over ","; the recursive call deals with any
        # "," still present inside the "/"-separated parts.
        for separator in ("/", ","):
            if separator in candidate:
                cleaned |= postprocess(set(candidate.split(separator)))
                break
        else:
            # No separator present: the entry is a single word.
            cleaned.add(candidate)
    return cleaned
|
|
|
|
|
|
# One bucket of raw regex matches per word category.
nouns = set()
adj = set()
digit = set()
prefix = set()
suffix = set()

for d in descriptions:
    # Descriptions may be None; re.findall needs a string.
    if d is None:
        continue
    # An uppercase letter (A-Z or Ö/Ä/Ü) followed by one or more characters
    # that are not uppercase, whitespace, digits or - / , + ( ) ".
    nouns.update(re.findall(r"([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)", d))
    # After a space: at least three non-uppercase, non-digit characters,
    # ending in a character that is also not whitespace.
    adj.update(re.findall(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])", d))
    # After a space: digits followed by at least four word/"."/"-"/"/"
    # characters.
    digit.update(re.findall(r" ([\d]+[\w.-]{3,}[\w./]+)", d))
    # Runs of word characters, "." and "-" that end in a hyphen.
    prefix.update(re.findall(r"([\w.-]+-)", d))
    # A hyphen followed by a run of word characters, "." and "-".
    suffix.update(re.findall(r"(-[\w.-]+)", d))

# Clean every bucket and sort it so the generated file is deterministic.
words = {
    category: sorted(postprocess(matches))
    for category, matches in (
        ("nouns", nouns),
        ("adj", adj),
        ("digit", digit),
        ("prefix", prefix),
        ("suffix", suffix),
    )
}

# tomli_w.dump writes bytes, hence the binary file mode.
with open('words.toml', 'wb') as outfile:
    tomli_w.dump(words, outfile)
|