1
0
Fork 0
mirror of https://github.com/Findus23/nonsense.git synced 2024-09-19 16:03:50 +02:00
nonsense/prepare.py
2018-01-20 21:26:30 +01:00

34 lines
867 B
Python
Executable file

#!/usr/bin/env python3
import re
import yaml
import utils
# Build words.yaml: word fragments harvested from the crawled descriptions,
# grouped into five categories (nouns, adj, digit, prefix, suffix).

# The patterns are raw strings so that \s, \d, \w and \- reach the regex
# engine untouched instead of being (invalid) string escape sequences.
# They are compiled once here rather than on every loop iteration.

# An uppercase letter (A-Z or Ö/Ä/Ü) followed by one or more characters that
# are not uppercase letters, whitespace, digits or any of - / , + ( ) "
_NOUN_RE = re.compile(r'([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()"]+)')
# After a space: at least three characters that are neither uppercase letters
# nor digits, ending in one that is additionally not whitespace.
_ADJ_RE = re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")
# After a space: digits, then at least three word/dot/hyphen characters,
# then one or more word/dot/slash characters.
_DIGIT_RE = re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")
# A run of word/dot/hyphen characters that ends in a hyphen.
_PREFIX_RE = re.compile(r"([\w.-]+-)")
# A hyphen followed by a run of word/dot/hyphen characters.
_SUFFIX_RE = re.compile(r"(-[\w.-]+)")

crawldata = utils.crawl_data()
# A set, so every distinct description is processed only once.
descriptions = {result["description"] for result in crawldata}
print(len(descriptions))

nouns = set()
adj = set()
digit = set()
prefix = set()
suffix = set()

for description in descriptions:
    # Entries may carry no description at all; there is nothing to scan then.
    if description is None:
        continue
    nouns.update(_NOUN_RE.findall(description))
    adj.update(_ADJ_RE.findall(description))
    digit.update(_DIGIT_RE.findall(description))
    prefix.update(_PREFIX_RE.findall(description))
    suffix.update(_SUFFIX_RE.findall(description))

# Sorted lists keep the generated file stable between runs.
words = {
    "nouns": sorted(nouns),
    "adj": sorted(adj),
    "digit": sorted(digit),
    "prefix": sorted(prefix),
    "suffix": sorted(suffix),
}

# Explicit encoding so the output does not depend on the platform's locale.
with open('words.yaml', 'w', encoding="utf-8") as outfile:
    yaml.dump(words, outfile, default_flow_style=False)