1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00

better line grouping

This commit is contained in:
Lukas Winkler 2021-07-16 11:51:05 +02:00
parent 311f4225a5
commit 7f44dd04c6
Signed by: lukas
GPG key ID: 54DE4D798D244853
3 changed files with 43 additions and 27 deletions

View file

@@ -1,7 +1,6 @@
 import os
 import re
 from html import unescape
-from itertools import groupby
 from typing import List, Optional, Set, Union
 from alive_progress import alive_bar
@@ -39,8 +38,26 @@ def line_key(line: Line) -> Union[str, Line]:
 def group_lines(dblines: List[Line]) -> List[Line]:
 final_lines = []
 order = 0
-for _, group in groupby(dblines, key=line_key):
-group = list(group)
+index = 0
+groups = []
+group = []
+last_key = None
+while index < len(dblines):
+line = dblines[index]
+key = line_key(line)
+if last_key != key and group:
+groups.append(group)
+group = []
+last_key = key
+group.append(line)
+if line.text[-1] in [".", "!", "?"] and group:
+groups.append(group)
+group = []
+index += 1
+for group in groups:
 first_line = group[0]
 dbline = Line()
 dbline.text = " ".join([line.text for line in group])
@@ -111,7 +128,17 @@ def main() -> None:
 line = line[1:]
 if ":" not in line:
 text = add_to_text(text, line)
+if text.startswith("(") and text.endswith(")"):
+dblines.append(insert_subtitle(text, None, subline, episode, isnote=True, order=i))
+i += 1
+text = ""
+if text.startswith("[") and text.endswith("]"):
+dblines.append(insert_subtitle(text, None, subline, episode, ismeta=True, order=i))
+text = ""
+i += 1
 continue
 name, resttext = line.split(":", maxsplit=1)
 if is_invalid_name(name) or not name[-1].isupper():
 text = add_to_text(text, line)
@@ -122,22 +149,6 @@ def main() -> None:
 i += 1
 text = ""
-if text.startswith("(") and text.endswith(")"):
-text = add_to_text(text, line)
-person = None
-dblines.append(insert_subtitle(text, person, subline, episode, isnote=True, order=i))
-i += 1
-text = ""
-continue
-if text.startswith("[") and text.endswith("]"):
-text = add_to_text(text, line)
-person = None
-dblines.append(insert_subtitle(text, person, subline, episode, ismeta=True, order=i))
-text = ""
-i += 1
-continue
 people = []
 name = name.lower()
 for word in re.split('[,&/]|and| an ', name):
@@ -155,7 +166,8 @@ def main() -> None:
 text = ""
 i += 1
-dblines = group_lines(dblines)
+if not series.single_speaker:
+dblines = group_lines(dblines)
 num_per_chunk = 100
 chunks = chunked(dblines, num_per_chunk)
@@ -168,6 +180,5 @@ def main() -> None:
episode.save()
clear_cache()
if __name__ == '__main__':
main()

View file

@@ -73,7 +73,9 @@ class Line(BaseModel):
 indexes = ((("episode", "order"), True),)
 def __str__(self) -> str:
-return f"<Line: {self.pk}>"
+if self.is_dirty():
+return f"<Line: {self.order} (dirty)>"
+return f"<Line: {self.id}>"
 class Phrase(BaseModel):

View file

@@ -16,13 +16,16 @@ select text, char_length(phrase.text) as len
from phrase
order by len desc;
-- delete
-- from phrase;
-- delete from line;
select e.pretty_title, text,char_length(line.text) as len from line join episode e on e.id = line.episode_id order by len desc;
-- update episode
-- set text_imported= False;
delete
from phrase;
delete from line;
update episode
set text_imported= False, phrases_imported=False;
update person set color=null;