better line grouping

2024-09-19 15:23:44 +02:00 · 2021-07-16 11:51:05 +02:00 · 2021-07-16 11:51:05 +02:00 · 7f44dd04c6
commit 7f44dd04c6
parent 311f4225a5
3 changed files with 43 additions and 27 deletions
--- a/import.py
+++ b/import.py
@ -1,7 +1,6 @@
 import os
 import re
 from html import unescape
-from itertools import groupby
 from typing import List, Optional, Set, Union

 from alive_progress import alive_bar
@ -39,8 +38,26 @@ def line_key(line: Line) -> Union[str, Line]:
 def group_lines(dblines: List[Line]) -> List[Line]:
    final_lines = []
    order = 0
-    for _, group in groupby(dblines, key=line_key):
-        group = list(group)
+
+    index = 0
+    groups = []
+    group = []
+    last_key = None
+
+    while index < len(dblines):
+        line = dblines[index]
+        key = line_key(line)
+        if last_key != key and group:
+            groups.append(group)
+            group = []
+        last_key = key
+        group.append(line)
+        if line.text[-1] in [".", "!", "?"] and group:
+            groups.append(group)
+            group = []
+        index += 1
+
+    for group in groups:
        first_line = group[0]
        dbline = Line()
        dbline.text = " ".join([line.text for line in group])
@ -111,7 +128,17 @@ def main() -> None:
                            line = line[1:]
                        if ":" not in line:
                            text = add_to_text(text, line)
+                            if text.startswith("(") and text.endswith(")"):
+                                dblines.append(insert_subtitle(text, None, subline, episode, isnote=True, order=i))
+                                i += 1
+                                text = ""
+
+                            if text.startswith("[") and text.endswith("]"):
+                                dblines.append(insert_subtitle(text, None, subline, episode, ismeta=True, order=i))
+                                text = ""
+                                i += 1
                            continue
+
                        name, resttext = line.split(":", maxsplit=1)
                        if is_invalid_name(name) or not name[-1].isupper():
                            text = add_to_text(text, line)
@ -122,22 +149,6 @@ def main() -> None:
                            i += 1
                            text = ""

-                        if text.startswith("(") and text.endswith(")"):
-                            text = add_to_text(text, line)
-                            person = None
-                            dblines.append(insert_subtitle(text, person, subline, episode, isnote=True, order=i))
-                            i += 1
-                            text = ""
-                            continue
-
-                        if text.startswith("[") and text.endswith("]"):
-                            text = add_to_text(text, line)
-                            person = None
-                            dblines.append(insert_subtitle(text, person, subline, episode, ismeta=True, order=i))
-                            text = ""
-                            i += 1
-                            continue
-
                        people = []
                        name = name.lower()
                        for word in re.split('[,&/]|and| an ', name):
@ -155,7 +166,8 @@ def main() -> None:
                            text = ""
                            i += 1

-                dblines = group_lines(dblines)
+                if not series.single_speaker:
+                    dblines = group_lines(dblines)

                num_per_chunk = 100
                chunks = chunked(dblines, num_per_chunk)
@ -168,6 +180,5 @@ def main() -> None:
                episode.save()
            clear_cache()

-
 if __name__ == '__main__':
    main()
--- a/models.py
+++ b/models.py
@ -73,7 +73,9 @@ class Line(BaseModel):
        indexes = ((("episode", "order"), True),)

    def __str__(self) -> str:
-        return f"<Line: {self.pk}>"
+        if self.is_dirty():
+            return f"<Line: {self.order} (dirty)>"
+        return f"<Line: {self.id}>"


 class Phrase(BaseModel):
--- a/tests.sql
+++ b/tests.sql
@ -16,13 +16,16 @@ select text, char_length(phrase.text) as len
 from phrase
 order by len desc;

-- delete
-- from phrase;

-- delete from line;
+select e.pretty_title, text,char_length(line.text) as len from line join episode e on e.id = line.episode_id order by len desc;

-- update episode
-- set text_imported= False;
+delete
+from phrase;
+
+delete from line;
+
+update episode
+set text_imported= False, phrases_imported=False;

 update person set color=null;