
add data of all crawls together

Lukas Winkler 2018-01-20 21:26:30 +01:00
parent c2bed3a527
commit bba1865186
GPG key ID: 94AFBE7C2656A5B5
5 changed files with 33 additions and 21 deletions

@@ -14,7 +14,7 @@ class NonsenseSpider(Spider):
]
custom_settings = {
'FEED_FORMAT': 'json',
'FEED_URI': "../crawl.json"
'FEED_URI': "../crawlData/crawl.json"
}
def parse(self, response):

@@ -1,17 +1,16 @@
#!/usr/bin/python3
- import json
import pickle
import os
import random
from PIL import Image
+ import utils
def gen():
table = [[[0 for i in range(221)] for j in range(221)] for k in range(221)]
- with open('crawl.json') as inputfile:
- crawldata = json.load(inputfile)
+ crawldata = utils.crawl_data()
names = {result["name"] for result in crawldata}
count = 0
for name in names:
@@ -30,11 +29,11 @@ def gen():
def save(data):
with open('ikeaname.pickle', 'wb') as outfile:
- pickle.dump(data, outfile,pickle.HIGHEST_PROTOCOL)
+ pickle.dump(data, outfile, pickle.HIGHEST_PROTOCOL)
def load():
with open('ikeaname.pickle',"rb") as inputfile:
with open('ikeaname.pickle', "rb") as inputfile:
table = pickle.load(inputfile)
return table
@@ -88,7 +87,6 @@ def generate():
a = b = 32
if __name__ == "__main__":
for _ in range(100):
print(generate())

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
import json
import re
import yaml
with open('crawl.json', "r") as inputfile:
crawldata = json.load(inputfile)
import utils
crawldata = utils.crawl_data()
descriptions = {result["description"] for result in crawldata}
print(len(descriptions))
@@ -24,11 +24,11 @@ for d in descriptions:
suffix.update(re.findall("(-[\w.-]+)", d))
words = {
"nouns": list(nouns),
"adj": list(adj),
"digit": list(digit),
"prefix": list(prefix),
"suffix": list(suffix)
"nouns": sorted(nouns),
"adj": sorted(adj),
"digit": sorted(digit),
"prefix": sorted(prefix),
"suffix": sorted(suffix)
}
with open('words.yaml', 'w') as outfile:
yaml.dump(words, outfile, default_flow_style=False)

@@ -45,7 +45,7 @@ def subscribe(bot, update, job_queue):
chat_id = update.message.chat_id
# Add job to queue
if chat_id in subscriptions:
- update.message.reply_text('You are already subscribed')
+ update.message.reply_text('Du bist bereits angemeldet')
return
job = job_queue.run_daily(subscribe_notification,
@@ -53,7 +53,7 @@ def subscribe(bot, update, job_queue):
time=datetime.datetime.now().replace(minute=0, hour=8, second=0)
+ datetime.timedelta(days=1))
subscriptions[chat_id] = job
- update.message.reply_text('Successfully subscribed')
+ update.message.reply_text('erfolgreich angemeldet')
def unsubscribe(bot, update):
@@ -61,14 +61,14 @@ def unsubscribe(bot, update):
chat_id = update.message.chat_id
if chat_id not in subscriptions:
- update.message.reply_text('You have no subscription')
+ update.message.reply_text('Du nicht angemeldet')
return
# Add job to queue
job = subscriptions[chat_id]
job.schedule_removal()
del subscriptions[chat_id]
- update.message.reply_text('Successfully unsubscribed')
+ update.message.reply_text('erfolgreich abgemeldet')
def multiple(bot, update, args):
@@ -83,7 +83,7 @@ def multiple(bot, update, args):
descriptions.append("+++ " + generate.get_description() + " +++")
update.message.reply_text("\n".join(descriptions))
except (IndexError, ValueError):
- update.message.reply_text('Usage: /multiple <count>')
+ update.message.reply_text('Verwendung: /mehrere <anzahl>')
def error(bot, update, error):

utils.py (new file)

@@ -0,0 +1,14 @@
import glob
import json


def crawl_data():
    all_data = []
    for file in glob.glob("crawlData/*.json"):
        with open(file, "r") as inputfile:
            all_data.extend(json.load(inputfile))
    return all_data


if __name__ == "__main__":
    print(crawl_data().__len__())
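
With utils.py in place, both generator scripts load the merged crawl data through a single helper call instead of reading crawl.json directly. A minimal usage sketch, mirroring the call sites shown in the diffs above:

import utils

# crawl_data() concatenates every JSON file found in crawlData/
crawldata = utils.crawl_data()

# the generators then build their word sets from the combined results
names = {result["name"] for result in crawldata}
descriptions = {result["description"] for result in crawldata}

Because the helper globs crawlData/*.json, any additional crawl output dropped into that directory is picked up automatically on the next run.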