mirror of
https://github.com/Findus23/cr-search.git
synced 2024-09-19 15:23:44 +02:00
merge following lines by the same speaker
This commit is contained in:
parent
f84b9be8b7
commit
10a46f5b43
3 changed files with 41 additions and 3 deletions
33
import.py
33
import.py
|
@ -1,7 +1,8 @@
|
|||
import os
|
||||
import re
|
||||
from html import unescape
|
||||
from typing import List, Optional, Set
|
||||
from itertools import groupby
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
from alive_progress import alive_bar
|
||||
from peewee import fn, chunked
|
||||
|
@ -29,6 +30,33 @@ def add_to_text(text: str, add: str) -> str:
|
|||
return add
|
||||
|
||||
|
||||
def line_key(line: Line) -> Union[str, Line]:
    """Return the grouping key for ``line``, for use as a ``groupby`` key.

    A line flagged as meta or note is keyed by the line object itself;
    every other line is keyed by its ``person``.

    NOTE(review): keying by the line object presumably keeps meta/note
    lines from being merged with their neighbours -- this relies on
    distinct ``Line`` objects comparing unequal; confirm against the model.
    NOTE(review): the annotation says ``str``, but ``person`` looks like a
    foreign key to ``Person`` -- confirm the intended return type.
    """
    stands_alone = line.ismeta or line.isnote
    return line if stands_alone else line.person
|
||||
|
||||
|
||||
def group_lines(dblines: List[Line]) -> List[Line]:
    """Collapse each run of adjacent lines sharing a key into one new ``Line``.

    Adjacent entries of ``dblines`` are grouped with ``itertools.groupby``
    using ``line_key``. For every run a fresh ``Line`` is built whose

    * ``text`` is the run's texts joined by single spaces,
    * ``search_text`` is ``to_tsvector('english', text)``,
    * ``person``, ``starttime``, ``episode``, ``isnote`` and ``ismeta``
      are copied from the first line of the run,
    * ``endtime`` is taken from the last line of the run,
    * ``order`` is the zero-based position of the run in the result.

    The new lines are returned in input order; nothing is saved here.
    """
    merged_lines = []
    # groupby only merges *adjacent* items with equal keys, so the input
    # order decides what ends up in the same run.
    for position, (_, run) in enumerate(groupby(dblines, key=line_key)):
        members = list(run)
        head = members[0]
        tail = members[-1]

        merged = Line()
        merged.text = " ".join([member.text for member in members])
        merged.search_text = fn.to_tsvector('english', merged.text)

        # Everything except the end time is inherited from the first line.
        merged.person = head.person
        merged.starttime = head.starttime
        merged.endtime = tail.endtime
        merged.episode = head.episode
        merged.isnote = head.isnote
        merged.ismeta = head.ismeta
        merged.order = position

        merged_lines.append(merged)
    return merged_lines
|
||||
|
||||
|
||||
def insert_subtitle(text: str, person: Optional[Person], subline: Subtitle, episode: Episode, order: int,
|
||||
isnote: bool = False, ismeta: bool = False) -> Line:
|
||||
dbline = Line()
|
||||
|
@ -126,6 +154,9 @@ def main() -> None:
|
|||
dblines.append(insert_subtitle(text, person, subline, episode, order=i))
|
||||
text = ""
|
||||
i += 1
|
||||
|
||||
dblines = group_lines(dblines)
|
||||
|
||||
num_per_chunk = 100
|
||||
chunks = chunked(dblines, num_per_chunk)
|
||||
with alive_bar(len(dblines) // num_per_chunk + 1) as bar:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from datetime import datetime
|
||||
|
||||
from peewee import IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \
|
||||
DateField
|
||||
DateField, TextField
|
||||
from playhouse.postgres_ext import TSVectorField
|
||||
|
||||
from app import flask_db
|
||||
|
@ -59,7 +59,7 @@ class Person(BaseModel):
|
|||
|
||||
|
||||
class Line(BaseModel):
|
||||
text = CharField()
|
||||
text = TextField()
|
||||
search_text = TSVectorField()
|
||||
person = ForeignKeyField(Person, backref="lines", null=True, on_delete="CASCADE")
|
||||
isnote = BooleanField(default=False)
|
||||
|
|
|
@ -19,6 +19,10 @@ order by len desc;
|
|||
-- delete
|
||||
-- from phrase;
|
||||
|
||||
delete from line;
|
||||
|
||||
update episode set text_imported=False;
|
||||
|
||||
EXPLAIN analyse
|
||||
SELECT text, sum(count) as total_count
|
||||
FROM phrase
|
||||
|
@ -76,3 +80,6 @@ WHERE ((("t1"."search_text" @@ websearch_to_tsquery('english', 'house')) AND ("t
|
|||
("t3"."season" = 1))
|
||||
ORDER BY rank DESC
|
||||
LIMIT 20;
|
||||
|
||||
|
||||
SELECT * FROM ts_stat('SELECT search_text from line') order by nentry desc limit 500;
|
||||
|
|
Loading…
Reference in a new issue