1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00

merge following lines by the same speaker

This commit is contained in:
Lukas Winkler 2021-07-15 22:24:40 +02:00
parent f84b9be8b7
commit 10a46f5b43
Signed by: lukas
GPG key ID: 54DE4D798D244853
3 changed files with 41 additions and 3 deletions

View file

@ -1,7 +1,8 @@
import os
import re
from html import unescape
from typing import List, Optional, Set
from itertools import groupby
from typing import List, Optional, Set, Union
from alive_progress import alive_bar
from peewee import fn, chunked
@ -29,6 +30,33 @@ def add_to_text(text: str, add: str) -> str:
return add
def line_key(line: Line) -> Union[str, Line]:
    """Return the key by which consecutive lines are grouped.

    A line flagged as ``ismeta`` or ``isnote`` is returned as its own key;
    every other line is keyed by its ``person`` attribute.

    NOTE(review): returning the line object itself presumably keeps meta and
    note lines from being merged with their neighbours -- this relies on two
    distinct ``Line`` objects not comparing equal; confirm.
    NOTE(review): the annotation says ``str``, but the value returned is
    whatever ``line.person`` holds -- confirm against the model definition.
    """
    stands_alone = line.ismeta or line.isnote
    return line if stands_alone else line.person
def group_lines(dblines: List[Line]) -> List[Line]:
    """Merge runs of consecutive lines that share the same ``line_key``.

    For every run a fresh ``Line`` is built (nothing is saved here):

    * ``text`` is the run's texts joined with a single space,
    * ``search_text`` is the ``to_tsvector('english', text)`` expression,
    * ``person``, ``starttime``, ``episode``, ``isnote`` and ``ismeta`` are
      copied from the first line of the run,
    * ``endtime`` is copied from the last line of the run,
    * ``order`` is the zero-based position of the run in the result.

    :param dblines: lines in the order in which they should be merged
    :return: one new ``Line`` per run, in the original order
    """
    merged_lines = []
    # groupby only merges *adjacent* items with equal keys, which is exactly
    # the "following lines by the same speaker" behaviour wanted here.
    for position, (_, run) in enumerate(groupby(dblines, key=line_key)):
        members = list(run)
        head = members[0]
        tail = members[-1]

        merged = Line()
        merged.text = " ".join(member.text for member in members)
        merged.search_text = fn.to_tsvector('english', merged.text)
        merged.person = head.person
        merged.starttime = head.starttime
        merged.endtime = tail.endtime
        merged.episode = head.episode
        merged.isnote = head.isnote
        merged.ismeta = head.ismeta
        merged.order = position
        merged_lines.append(merged)
    return merged_lines
def insert_subtitle(text: str, person: Optional[Person], subline: Subtitle, episode: Episode, order: int,
isnote: bool = False, ismeta: bool = False) -> Line:
dbline = Line()
@ -126,6 +154,9 @@ def main() -> None:
dblines.append(insert_subtitle(text, person, subline, episode, order=i))
text = ""
i += 1
dblines = group_lines(dblines)
num_per_chunk = 100
chunks = chunked(dblines, num_per_chunk)
with alive_bar(len(dblines) // num_per_chunk + 1) as bar:

View file

@ -1,7 +1,7 @@
from datetime import datetime
from peewee import IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \
DateField
DateField, TextField
from playhouse.postgres_ext import TSVectorField
from app import flask_db
@ -59,7 +59,7 @@ class Person(BaseModel):
class Line(BaseModel):
text = CharField()
text = TextField()
search_text = TSVectorField()
person = ForeignKeyField(Person, backref="lines", null=True, on_delete="CASCADE")
isnote = BooleanField(default=False)

View file

@ -19,6 +19,10 @@ order by len desc;
-- delete
-- from phrase;
-- Remove every row from "line" and set text_imported to False on every episode.
delete from line;
update episode set text_imported=False;
EXPLAIN analyse
SELECT text, sum(count) as total_count
FROM phrase
@ -76,3 +80,6 @@ WHERE ((("t1"."search_text" @@ websearch_to_tsquery('english', 'house')) AND ("t
("t3"."season" = 1))
ORDER BY rank DESC
LIMIT 20;
-- Lexeme statistics over line.search_text: the 500 entries with the highest nentry.
SELECT * FROM ts_stat('SELECT search_text from line') order by nentry desc limit 500;