2016-07-19 15:53:11 +02:00
|
|
|
#!/usr/bin/env python3
|
2017-12-27 12:14:52 +01:00
|
|
|
|
2016-07-19 14:53:49 +02:00
|
|
|
import re
|
2017-12-27 12:14:52 +01:00
|
|
|
import yaml
|
2016-07-19 14:53:49 +02:00
|
|
|
|
2018-01-20 21:26:30 +01:00
|
|
|
import utils
|
|
|
|
|
|
|
|
# Fetch the crawl results via the project-local utils module.
crawldata = utils.crawl_data()

# Collect the distinct "description" values; every result is indexed with
# that key, so a result lacking it raises KeyError.
descriptions = set()
descriptions.update(entry["description"] for entry in crawldata)

# Report how many unique descriptions were found.
print(len(descriptions))
|
2016-11-08 15:00:15 +01:00
|
|
|
|
|
|
|
# Extraction patterns, compiled once instead of on every loop iteration.
# Each has exactly one capture group, so findall() returns plain strings.
# Raw literals are used so that \s, \d, \w and \- reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes,
# which emit a SyntaxWarning on recent Python versions.

# An uppercase letter (A-Z, Ö, Ä, Ü) followed by one or more characters that
# are not uppercase, whitespace, a digit, or one of - / , + ( ) ".
_NOUN_RE = re.compile(r'([A-ZÖÄÜ][^A-Z\s\dÖÄÜ\-/,+()"]+)')
# After a space: three or more characters that are neither uppercase letters
# nor digits (spaces allowed), ending in one that is also not whitespace.
_ADJ_RE = re.compile(r" ([^A-ZÖÄÜ\d]{3,}[^A-ZÖÄÜ\s\d])")
# After a space: digits, then three or more word/dot/hyphen characters, then
# one or more word/dot/slash characters.
_DIGIT_RE = re.compile(r" ([\d]+[\w.-]{3,}[\w./]+)")
# A run of word/dot/hyphen characters ending in a hyphen.
_PREFIX_RE = re.compile(r"([\w.-]+-)")
# A hyphen followed by one or more word/dot/hyphen characters.
_SUFFIX_RE = re.compile(r"(-[\w.-]+)")

# One accumulator per category; sets drop duplicate matches.
nouns = set()
adj = set()
digit = set()
prefix = set()
suffix = set()

for d in descriptions:
    # Skip entries whose description is None.
    if d is None:
        continue
    nouns.update(_NOUN_RE.findall(d))
    adj.update(_ADJ_RE.findall(d))
    digit.update(_DIGIT_RE.findall(d))
    prefix.update(_PREFIX_RE.findall(d))
    suffix.update(_SUFFIX_RE.findall(d))
|
2016-11-08 15:00:15 +01:00
|
|
|
|
|
|
|
# Turn every collected set into a sorted list so the output order is stable.
words = {
    category: sorted(found)
    for category, found in (
        ("nouns", nouns),
        ("adj", adj),
        ("digit", digit),
        ("prefix", prefix),
        ("suffix", suffix),
    )
}

# Write the mapping to words.yaml; default_flow_style=False selects YAML
# block style instead of inline flow collections.
with open('words.yaml', 'w') as outfile:
    yaml.dump(words, stream=outfile, default_flow_style=False)
|