
add data of all crawls together

Lukas Winkler 2018-01-20 21:26:30 +01:00
parent c2bed3a527
commit bba1865186
No known key found for this signature in database
GPG key ID: 94AFBE7C2656A5B5
5 changed files with 33 additions and 21 deletions
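
The change swaps each script's direct read of a single crawl.json for a shared utils.crawl_data() helper (new file at the bottom of this diff) that concatenates every JSON export under crawlData/. A minimal sketch of the resulting call pattern; the file names under crawlData/ are assumptions, not part of the commit:

import utils

# crawlData/ is expected to hold one JSON list per crawl, e.g. (hypothetical):
#   crawlData/crawl.json, crawlData/crawl-old.json
crawldata = utils.crawl_data()  # one merged list of result dicts
names = {result["name"] for result in crawldata}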

View file

@@ -14,7 +14,7 @@ class NonsenseSpider(Spider):
     ]
     custom_settings = {
         'FEED_FORMAT': 'json',
-        'FEED_URI': "../crawl.json"
+        'FEED_URI': "../crawlData/crawl.json"
     }
 
     def parse(self, response):
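
Since FEED_URI now points into crawlData/, one way to keep earlier exports alongside the newest crawl is a per-run file name. A sketch only; the date-stamped scheme is an assumption, not what this commit does:

import datetime

custom_settings = {
    'FEED_FORMAT': 'json',
    # hypothetical per-crawl file so older exports in crawlData/ survive
    'FEED_URI': "../crawlData/crawl-{}.json".format(datetime.date.today()),
}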

View file

@@ -1,17 +1,16 @@
 #!/usr/bin/python3
 
-import json
 import pickle
 import os
 import random
 
 from PIL import Image
 
+import utils
 
 def gen():
     table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)]
-    with open('crawl.json') as inputfile:
-        crawldata = json.load(inputfile)
+    crawldata = utils.crawl_data()
     names = {result["name"] for result in crawldata}
     count = 0
     for name in names:
@@ -30,11 +29,11 @@ def gen():
 
 
 def save(data):
     with open('ikeaname.pickle', 'wb') as outfile:
-        pickle.dump(data, outfile,pickle.HIGHEST_PROTOCOL)
+        pickle.dump(data, outfile, pickle.HIGHEST_PROTOCOL)
 
 
 def load():
-    with open('ikeaname.pickle',"rb") as inputfile:
+    with open('ikeaname.pickle', "rb") as inputfile:
         table = pickle.load(inputfile)
     return table
@@ -88,7 +87,6 @@ def generate():
     a = b = 32
 
 
-
 if __name__ == "__main__":
     for _ in range(100):
         print(generate())

View file

@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
-import json
 import re
 
 import yaml
 
-with open('crawl.json', "r") as inputfile:
-    crawldata = json.load(inputfile)
+import utils
+
+crawldata = utils.crawl_data()
 
 descriptions = {result["description"] for result in crawldata}
 print(len(descriptions))
@@ -24,11 +24,11 @@ for d in descriptions:
     suffix.update(re.findall("(-[\w.-]+)", d))
 
 words = {
-    "nouns": list(nouns),
-    "adj": list(adj),
-    "digit": list(digit),
-    "prefix": list(prefix),
-    "suffix": list(suffix)
+    "nouns": sorted(nouns),
+    "adj": sorted(adj),
+    "digit": sorted(digit),
+    "prefix": sorted(prefix),
+    "suffix": sorted(suffix)
 }
 with open('words.yaml', 'w') as outfile:
     yaml.dump(words, outfile, default_flow_style=False)

View file

@@ -45,7 +45,7 @@ def subscribe(bot, update, job_queue):
     chat_id = update.message.chat_id
     # Add job to queue
     if chat_id in subscriptions:
-        update.message.reply_text('You are already subscribed')
+        update.message.reply_text('Du bist bereits angemeldet')
         return
 
     job = job_queue.run_daily(subscribe_notification,
@@ -53,7 +53,7 @@ def subscribe(bot, update, job_queue):
                               time=datetime.datetime.now().replace(minute=0, hour=8, second=0)
                               + datetime.timedelta(days=1))
     subscriptions[chat_id] = job
-    update.message.reply_text('Successfully subscribed')
+    update.message.reply_text('erfolgreich angemeldet')
 
 
 def unsubscribe(bot, update):
@@ -61,14 +61,14 @@ def unsubscribe(bot, update):
     chat_id = update.message.chat_id
 
     if chat_id not in subscriptions:
-        update.message.reply_text('You have no subscription')
+        update.message.reply_text('Du bist nicht angemeldet')
         return
 
     # Add job to queue
     job = subscriptions[chat_id]
     job.schedule_removal()
     del subscriptions[chat_id]
-    update.message.reply_text('Successfully unsubscribed')
+    update.message.reply_text('erfolgreich abgemeldet')
 
 
 def multiple(bot, update, args):
@@ -83,7 +83,7 @@ def multiple(bot, update, args):
             descriptions.append("+++ " + generate.get_description() + " +++")
         update.message.reply_text("\n".join(descriptions))
     except (IndexError, ValueError):
-        update.message.reply_text('Usage: /multiple <count>')
+        update.message.reply_text('Verwendung: /mehrere <anzahl>')
 
 
 def error(bot, update, error):
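
The usage text now advertises /mehrere, so the CommandHandler registration (outside this hunk) presumably matches. A sketch against the pre-v12 python-telegram-bot API that the (bot, update, args) signatures above imply; the token is a placeholder and multiple refers to the handler shown in this diff:

from telegram.ext import CommandHandler, Updater

updater = Updater("TOKEN")  # placeholder bot token
# register the multiple() handler above under the German command name
updater.dispatcher.add_handler(CommandHandler("mehrere", multiple, pass_args=True))
updater.start_polling()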

utils.py Normal file (+14 lines)
View file

@@ -0,0 +1,14 @@
+import glob
+import json
+
+
+def crawl_data():
+    all_data = []
+    for file in glob.glob("crawlData/*.json"):
+        with open(file, "r") as inputfile:
+            all_data.extend(json.load(inputfile))
+    return all_data
+
+
+if __name__ == "__main__":
+    print(len(crawl_data()))
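
A quick way to sanity-check the helper, assuming it runs from the repository root next to crawlData/; the sample file name and contents are hypothetical:

import json

import utils

# write a tiny extra export so there is something to merge (hypothetical data)
with open("crawlData/sample.json", "w") as outfile:
    json.dump([{"name": "BILLY", "description": "a bookcase"}], outfile)

print(len(utils.crawl_data()))  # item count across every crawlData/*.json file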