1
0
Fork 0
mirror of https://github.com/Findus23/nonsense.git synced 2024-09-19 16:03:50 +02:00
nonsense/prepare.py

59 lines
1.6 KiB
Python
Raw Normal View History

2016-07-19 15:53:11 +02:00
#!/usr/bin/env python3
2017-12-27 12:14:52 +01:00
2016-07-19 14:53:49 +02:00
import re
2021-05-12 18:41:01 +02:00
from typing import Set
2023-02-04 21:45:50 +01:00
import tomli_w
2016-07-19 14:53:49 +02:00
2018-01-20 21:26:30 +01:00
import utils
# Fetch the crawled records through the project-local ``utils`` helper and
# keep the distinct values found under each record's "description" key.
crawldata = utils.crawl_data()
descriptions = {entry["description"] for entry in crawldata}
# Report how many distinct descriptions were collected.
print(len(descriptions))
2016-11-08 15:00:15 +01:00
2021-05-12 18:41:01 +02:00
# Entries that *start* with a dimension or numeric range token
# ("10x20", "3-5", "12-", "-7") are discarded by postprocess().
filter_regex = re.compile(r"(\d+[x-]\d+|\d+-|-\d+)", flags=re.IGNORECASE)


def postprocess(wordset: Set[str]) -> Set[str]:
    """Clean up a set of extracted word fragments.

    Each entry is stripped of surrounding whitespace and of the filler
    phrases "+", "für " and "®".  Entries that are empty afterwards, or
    that begin with a numeric range/dimension (see ``filter_regex``), are
    dropped.  Entries containing "/" or "," are split on that separator
    and the pieces are cleaned recursively.

    Args:
        wordset: Raw fragments to clean.

    Returns:
        A new set with the cleaned fragments; ``wordset`` is not modified.
    """
    new_words = set()
    for word in wordset:
        word = word.strip()
        for replacephrase in ("+", "für ", "®"):
            word = word.replace(replacephrase, "").strip()
        # Every "+" has been removed above, so the only degenerate
        # leftover is the empty string.
        if not word or filter_regex.match(word):
            continue
        # "/" takes precedence over ","; commas inside the "/"-separated
        # pieces are handled by the recursive call.
        if "/" in word:
            new_words.update(postprocess(set(word.split("/"))))
        elif "," in word:
            new_words.update(postprocess(set(word.split(","))))
        else:
            new_words.add(word)
    return new_words
2016-11-08 15:00:15 +01:00
# One bucket per word category that is extracted from the descriptions.
nouns = set()
adj = set()
digit = set()
prefix = set()
suffix = set()

# Each bucket is paired with the pattern whose matches are collected into it.
_extractors = (
    (nouns, re.compile(r"([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()\"]+)")),
    (adj, re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")),
    (digit, re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")),
    (prefix, re.compile(r"([\w.-]+-)")),
    (suffix, re.compile(r"(-[\w.-]+)")),
)

for description in descriptions:
    # Missing descriptions (None) carry no text to scan.
    if description is None:
        continue
    for bucket, pattern in _extractors:
        bucket.update(pattern.findall(description))
2016-11-08 15:00:15 +01:00
# Run every bucket through postprocess() and store the result as a sorted
# list under its category name.
words = {
    name: sorted(postprocess(bucket))
    for name, bucket in (
        ("nouns", nouns),
        ("adj", adj),
        ("digit", digit),
        ("prefix", prefix),
        ("suffix", suffix),
    )
}

# The output file is opened in binary mode and handed to tomli_w.dump.
with open('words.toml', 'wb') as outfile:
    tomli_w.dump(words, outfile)