# 2018-01-20 21:26:30 +01:00
import json
# 2021-05-12 18:41:01 +02:00
from pathlib import Path
datadir = Path("crawlData")
# 2018-01-20 21:26:30 +01:00
def crawl_data(data_dir=None):
    """Load and concatenate all crawled records found in *data_dir*.

    Reads every ``*.json`` file (each expected to hold a JSON array of
    records) and every ``*.jsonl`` file (one JSON value per line; blank
    or whitespace-only lines are skipped) in the directory.

    Args:
        data_dir: Directory to scan (``pathlib.Path``). Defaults to the
            module-level ``datadir`` (``crawlData``).

    Returns:
        list: All records — ``*.json`` contents first, then ``*.jsonl``
        records — in sorted filename order within each group.

    Raises:
        json.JSONDecodeError: If any file contains invalid JSON.
    """
    if data_dir is None:
        data_dir = datadir
    all_data = []
    # Sort the glob results so output order is deterministic; raw glob
    # order depends on the filesystem.
    for path in sorted(data_dir.glob("*.json")):
        with path.open(encoding="utf-8") as inputfile:
            all_data.extend(json.load(inputfile))
    for path in sorted(data_dir.glob("*.jsonl")):
        with path.open(encoding="utf-8") as inputfile:
            for line in inputfile:
                # Skip blank and whitespace-only separator lines
                # (the original `line == "\n"` check missed lines
                # containing only spaces/tabs).
                if not line.strip():
                    continue
                all_data.append(json.loads(line))
    return all_data
if __name__ == "__main__":
    # Report the total number of crawled records.
    # len(...) is the idiomatic form of the original
    # crawl_data().__len__() call.
    print(len(crawl_data()))