"""Build (or load) a Markov model from a JSONL corpus and print sample sentences.

Reads ``<BASEDIR>/<mode>.jsonl`` (one text per record), builds a POSifiedText
Markov model per text, combines them in batches of 1000 to bound memory,
caches the combined model as ``<BASEDIR>/<mode>.chain.json``, then prints
10 generated sentences.
"""
import jsonlines
import markovify
from nltk.tokenize.moses import MosesDetokenizer, MosesTokenizer

from markov import POSifiedText
from utils import *

BASEDIR, mode = get_settings(2)
if mode not in ("Questions", "Answers", "Titles"):
    print("error")
    exit()

chainfile = BASEDIR + '/{type}.chain.json'.format(type=mode)

try:
    # Fast path: reuse a previously built, cached model.
    with open(chainfile, 'r') as myfile:
        data = myfile.read()
    chain = POSifiedText.from_json(data)
    print("using existing file\n")
except FileNotFoundError:
    # Titles are short, so a state size of 1 suffices; otherwise use 2.
    state_size = 1 if mode == "Titles" else 2
    combined_chain = None  # running combination of all flushed batches
    batch = []             # per-text models awaiting combination
    i = 0
    with jsonlines.open(BASEDIR + "/{type}.jsonl".format(type=mode), mode="r") as content:
        for text in content:
            text = text.strip()
            try:
                model = POSifiedText(text, state_size, retain_original=False)
            except KeyError:
                # markovify raises KeyError for texts too short/degenerate to model.
                continue
            batch.append(model)
            if i % 100 == 0:
                print(i)  # progress indicator
            # Flush once a full batch has accumulated so the list of pending
            # models stays small.  (The original flushed on `i % 1000 == 0`,
            # which also fired at i == 0 on a single-model "batch".)
            if len(batch) >= 1000:
                subtotal = markovify.combine(batch)
                if combined_chain is None:
                    combined_chain = subtotal
                else:
                    combined_chain = markovify.combine(models=[combined_chain, subtotal])
                batch = []
            i += 1
    # Fold in any remainder.  Guard against an empty final batch: the original
    # called markovify.combine([]) (which fails) whenever the corpus size was
    # an exact multiple of 1000, and combine([None, ...]) on an empty corpus.
    if batch:
        subtotal = markovify.combine(batch)
        if combined_chain is None:
            combined_chain = subtotal
        else:
            combined_chain = markovify.combine(models=[combined_chain, subtotal])
    if combined_chain is None:
        # Nothing in the corpus could be modeled; there is no chain to save.
        print("error")
        exit()
    chain = combined_chain
    with open(chainfile, 'w') as outfile:
        outfile.write(chain.to_json())

# Sample the model: print 10 generated sentences with separators.
for _ in range(10):
    print(chain.make_sentence())
    print("-----------------------------------")

print_ram()