mirror of
https://github.com/Findus23/cr-search.git
synced 2024-09-19 15:23:44 +02:00
better line grouping
This commit is contained in:
parent
311f4225a5
commit
7f44dd04c6
3 changed files with 43 additions and 27 deletions
53
import.py
53
import.py
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
import re
|
||||
from html import unescape
|
||||
from itertools import groupby
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
from alive_progress import alive_bar
|
||||
|
@ -39,8 +38,26 @@ def line_key(line: Line) -> Union[str, Line]:
|
|||
def group_lines(dblines: List[Line]) -> List[Line]:
|
||||
final_lines = []
|
||||
order = 0
|
||||
for _, group in groupby(dblines, key=line_key):
|
||||
group = list(group)
|
||||
|
||||
index = 0
|
||||
groups = []
|
||||
group = []
|
||||
last_key = None
|
||||
|
||||
while index < len(dblines):
|
||||
line = dblines[index]
|
||||
key = line_key(line)
|
||||
if last_key != key and group:
|
||||
groups.append(group)
|
||||
group = []
|
||||
last_key = key
|
||||
group.append(line)
|
||||
if line.text[-1] in [".", "!", "?"] and group:
|
||||
groups.append(group)
|
||||
group = []
|
||||
index += 1
|
||||
|
||||
for group in groups:
|
||||
first_line = group[0]
|
||||
dbline = Line()
|
||||
dbline.text = " ".join([line.text for line in group])
|
||||
|
@ -111,7 +128,17 @@ def main() -> None:
|
|||
line = line[1:]
|
||||
if ":" not in line:
|
||||
text = add_to_text(text, line)
|
||||
if text.startswith("(") and text.endswith(")"):
|
||||
dblines.append(insert_subtitle(text, None, subline, episode, isnote=True, order=i))
|
||||
i += 1
|
||||
text = ""
|
||||
|
||||
if text.startswith("[") and text.endswith("]"):
|
||||
dblines.append(insert_subtitle(text, None, subline, episode, ismeta=True, order=i))
|
||||
text = ""
|
||||
i += 1
|
||||
continue
|
||||
|
||||
name, resttext = line.split(":", maxsplit=1)
|
||||
if is_invalid_name(name) or not name[-1].isupper():
|
||||
text = add_to_text(text, line)
|
||||
|
@ -122,22 +149,6 @@ def main() -> None:
|
|||
i += 1
|
||||
text = ""
|
||||
|
||||
if text.startswith("(") and text.endswith(")"):
|
||||
text = add_to_text(text, line)
|
||||
person = None
|
||||
dblines.append(insert_subtitle(text, person, subline, episode, isnote=True, order=i))
|
||||
i += 1
|
||||
text = ""
|
||||
continue
|
||||
|
||||
if text.startswith("[") and text.endswith("]"):
|
||||
text = add_to_text(text, line)
|
||||
person = None
|
||||
dblines.append(insert_subtitle(text, person, subline, episode, ismeta=True, order=i))
|
||||
text = ""
|
||||
i += 1
|
||||
continue
|
||||
|
||||
people = []
|
||||
name = name.lower()
|
||||
for word in re.split('[,&/]|and| an ', name):
|
||||
|
@ -155,7 +166,8 @@ def main() -> None:
|
|||
text = ""
|
||||
i += 1
|
||||
|
||||
dblines = group_lines(dblines)
|
||||
if not series.single_speaker:
|
||||
dblines = group_lines(dblines)
|
||||
|
||||
num_per_chunk = 100
|
||||
chunks = chunked(dblines, num_per_chunk)
|
||||
|
@ -168,6 +180,5 @@ def main() -> None:
|
|||
episode.save()
|
||||
clear_cache()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@ -73,7 +73,9 @@ class Line(BaseModel):
|
|||
indexes = ((("episode", "order"), True),)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"<Line: {self.pk}>"
|
||||
if self.is_dirty():
|
||||
return f"<Line: {self.order} (dirty)>"
|
||||
return f"<Line: {self.id}>"
|
||||
|
||||
|
||||
class Phrase(BaseModel):
|
||||
|
|
13
tests.sql
13
tests.sql
|
@ -16,13 +16,16 @@ select text, char_length(phrase.text) as len
|
|||
from phrase
|
||||
order by len desc;
|
||||
|
||||
-- delete
|
||||
-- from phrase;
|
||||
|
||||
-- delete from line;
|
||||
select e.pretty_title, text,char_length(line.text) as len from line join episode e on e.id = line.episode_id order by len desc;
|
||||
|
||||
-- update episode
|
||||
-- set text_imported= False;
|
||||
delete
|
||||
from phrase;
|
||||
|
||||
delete from line;
|
||||
|
||||
update episode
|
||||
set text_imported= False, phrases_imported=False;
|
||||
|
||||
update person set color=null;
|
||||
|
||||
|
|
Loading…
Reference in a new issue