1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-11 06:03:45 +02:00

Add Redis caching and move the models and scripts onto the shared FlaskDB database from app.py

This commit is contained in:
Lukas Winkler 2021-07-07 17:27:21 +02:00
parent 76dd2177a6
commit 4b3e829dee
Signed by: lukas
GPG key ID: 54DE4D798D244853
11 changed files with 103 additions and 34 deletions

9
app.py
View file

@ -1,4 +1,5 @@
from flask import Flask
from flask_caching import Cache
from playhouse.flask_utils import FlaskDB
from playhouse.pool import PooledPostgresqlDatabase
@ -9,15 +10,21 @@ DATABASE = PooledPostgresqlDatabase(**config.dbauth)
if config.sentryDSN:
import sentry_sdk
from sentry_sdk.integrations.flask import FlaskIntegration
sentry_sdk.init(
dsn=config.sentryDSN,
integrations=[FlaskIntegration()]
)
CACHE_TYPE = "RedisCache"
if config.production:
CACHE_REDIS_URL = "unix:///run/redis-crsearch/redis-server.sock"
# Create a Flask WSGI app and configure it using values from the module.
app = Flask(__name__)
app.config.from_object(__name__)
cache = Cache(app)
flask_db = FlaskDB(app)
db = flask_db.database

View file

@ -7,12 +7,12 @@ from alive_progress import alive_bar
from peewee import SelectQuery
from psycopg2._psycopg import cursor
from models import db
from server import search, suggest
from app import db
from server import search, suggest, exact_search
def benchmark_query(query: SelectQuery, filename: str = None) -> Tuple[float, float]:
query, params = test_search.sql()
query, params = query.sql()
query = "EXPLAIN (ANALYZE, COSTS, VERBOSE, BUFFERS, FORMAT JSON) " + query
@ -45,12 +45,14 @@ def statistics(query: SelectQuery, filename: str, repeats: int = 500) -> None:
print(mean(execution_times), stdev(execution_times))
test_search = search("hello", 1000, 1, 200)
test_search = search("hello", 1000, "campaign2", 200)
statistics(test_search, filename="search_hello")
test_search = search("a very long search query with a lot of stop word", 1000, 1, 200)
test_search = exact_search("hello", 1000, "campaign2", 200)
statistics(test_search, filename="exact_search", repeats=50)
test_search = search("a very long search query with a lot of stop word", 1000, "campaign2", 200)
statistics(test_search, filename="search_long")
test_search = suggest("gnoll", 1000, 1)
test_search = suggest("gnoll", 1000, "campaign2")
statistics(test_search, filename="suggest_simple")
test_search = suggest("gu", 1000, 1)
test_search = suggest("gu", 1000, "campaign2")
statistics(test_search, filename="suggest_two_letter", repeats=100)

View file

@ -1,6 +1,7 @@
from sys import argv
from models import db, Series, Phrase, Episode, Person, Line
from app import db
from models import Series, Phrase, Episode, Person, Line
def confirm(message: str) -> None:

View file

@ -12,7 +12,7 @@ from peewee import DoesNotExist
from data import series_data
from models import Episode, Series, Line, Phrase
from utils import srtdir, pretty_title, title_to_episodenumber
from utils import srtdir, pretty_title, title_to_episodenumber, clear_cache
static_path = Path("static")
@ -112,7 +112,7 @@ def main(args) -> None:
except FileNotFoundError:
e.downloaded = False
e.save()
clear_cache()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="fetch episode data from YouTube")

View file

@ -7,9 +7,10 @@ from alive_progress import alive_bar
from peewee import fn, chunked
from srt import parse, Subtitle
from models import Person, Line, Episode, db, Series
from app import db
from models import Person, Line, Episode, Series
from typo import fix_typo
from utils import td_to_milliseconds, srtdir, episode_speaker
from utils import td_to_milliseconds, srtdir, episode_speaker, clear_cache
def is_invalid_name(name: str) -> bool:
@ -134,6 +135,7 @@ def main() -> None:
episode.text_imported = True
episode.save()
clear_cache()
if __name__ == '__main__':

View file

@ -1,18 +1,14 @@
from datetime import datetime
from peewee import PostgresqlDatabase, Model, IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \
from peewee import IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \
DateField
from playhouse.postgres_ext import TSVectorField
from config import dbauth
db = PostgresqlDatabase(**dbauth)
db.connect()
from app import flask_db
class BaseModel(Model):
class Meta:
database = db
# Shared base class for the models declared in this module. It inherits from
# ``flask_db.Model`` (imported from ``app``) and adds no fields or behavior
# of its own.
class BaseModel(flask_db.Model):
    pass
class Series(BaseModel):

View file

@ -9,8 +9,10 @@ from spacy.lang.en import Language
from spacy.tokens.span import Span
from spacy.tokens.token import Token
from models import Episode, Line, db, Phrase
from app import db
from models import Episode, Line, Phrase
from stopwords import STOP_WORDS
from utils import clear_cache
os.nice(15)
@ -81,3 +83,4 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi
episode.phrases_imported = True
episode.save()
clear_cache()

67
poetry.lock generated
View file

@ -57,6 +57,14 @@ category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "colorama"
version = "0.4.4"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "cymem"
version = "2.0.5"
@ -79,6 +87,7 @@ spacy = ">=3.0.0,<3.1.0"
[package.source]
type = "url"
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0.tar.gz"
[[package]]
name = "flask"
version = "2.0.1"
@ -97,6 +106,17 @@ Werkzeug = ">=2.0"
async = ["asgiref (>=3.2)"]
dotenv = ["python-dotenv"]
[[package]]
name = "flask-caching"
version = "1.10.1"
description = "Adds caching support to your Flask application"
category = "main"
optional = false
python-versions = ">=3.5"
[package.dependencies]
Flask = "*"
[[package]]
name = "gunicorn"
version = "20.1.0"
@ -243,6 +263,17 @@ category = "main"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "redis"
version = "3.5.3"
description = "Python client for Redis key-value store"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[package.extras]
hiredis = ["hiredis (>=0.1.3)"]
[[package]]
name = "requests"
version = "2.25.1"
@ -263,7 +294,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
[[package]]
name = "sentry-sdk"
version = "1.1.0"
version = "1.2.0"
description = "Python client for Sentry (https://sentry.io)"
category = "main"
optional = false
@ -284,6 +315,7 @@ chalice = ["chalice (>=1.16.0)"]
django = ["django (>=1.8)"]
falcon = ["falcon (>=1.4)"]
flask = ["flask (>=0.11)", "blinker (>=1.1)"]
httpx = ["httpx (>=0.16.0)"]
pure_eval = ["pure-eval", "executing", "asttokens"]
pyspark = ["pyspark (>=2.4.4)"]
rq = ["rq (>=0.6)"]
@ -367,7 +399,7 @@ transformers = ["spacy-transformers (>=1.0.1,<1.1.0)"]
[[package]]
name = "spacy-legacy"
version = "3.0.6"
version = "3.0.7"
description = "Legacy registered functions for spaCy backwards compatibility"
category = "main"
optional = false
@ -429,12 +461,15 @@ torch = ["torch (>=1.5.0)"]
[[package]]
name = "tqdm"
version = "4.61.1"
version = "4.61.2"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[package.extras]
dev = ["py-make (>=0.1.0)", "twine", "wheel"]
notebook = ["ipywidgets (>=6)"]
@ -500,7 +535,7 @@ python-versions = "*"
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "86eee8fba06068f62881fc46114eb1861a7eb3e4130c7ac0f9ffc457645b7157"
content-hash = "08d267706eac1a2c5f03bac0f5a705d627399f2e75e64cbd5f98c1a34fa0a86c"
[metadata.files]
alive-progress = [
@ -541,6 +576,10 @@ click = [
{file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
{file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"},
]
colorama = [
{file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"},
{file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"},
]
cymem = [
{file = "cymem-2.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d72d69f7a62a280199c3aa7bc550685c47d6d0689b2d299e6492253b86d2437"},
{file = "cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:8ea57e6923f40eb51012352161bb5707c14a5a5ce901ff72021e59df06221655"},
@ -561,6 +600,10 @@ flask = [
{file = "Flask-2.0.1-py3-none-any.whl", hash = "sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"},
{file = "Flask-2.0.1.tar.gz", hash = "sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55"},
]
flask-caching = [
{file = "Flask-Caching-1.10.1.tar.gz", hash = "sha256:cf19b722fcebc2ba03e4ae7c55b532ed53f0cbf683ce36fafe5e881789a01c00"},
{file = "Flask_Caching-1.10.1-py3-none-any.whl", hash = "sha256:bcda8acbc7508e31e50f63e9b1ab83185b446f6b6318bd9dd1d45626fba2e903"},
]
gunicorn = [
{file = "gunicorn-20.1.0.tar.gz", hash = "sha256:e0a968b5ba15f8a328fdfd7ab1fcb5af4470c28aaf7e55df02a99bc13138e6e8"},
]
@ -730,13 +773,17 @@ pyparsing = [
{file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
{file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
]
redis = [
{file = "redis-3.5.3-py2.py3-none-any.whl", hash = "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"},
{file = "redis-3.5.3.tar.gz", hash = "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2"},
]
requests = [
{file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
{file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
]
sentry-sdk = [
{file = "sentry-sdk-1.1.0.tar.gz", hash = "sha256:c1227d38dca315ba35182373f129c3e2722e8ed999e52584e6aca7d287870739"},
{file = "sentry_sdk-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7d380a21281e15be3d9f67a3c4fbb4f800c481d88ff8d8931f39486dd7b4ada"},
{file = "sentry-sdk-1.2.0.tar.gz", hash = "sha256:9907adbdd30a55b818914512cc143e6beae0bb3ba78b2649f4b079752eb0e424"},
{file = "sentry_sdk-1.2.0-py2.py3-none-any.whl", hash = "sha256:593f6118cc6d3eba4786c3f802567c937bdb81b3c8e90436e8a29e84071c6936"},
]
setproctitle = [
{file = "setproctitle-1.2.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:9106bcbacae534b6f82955b176723f1b2ca6514518aab44dffec05a583f8dca8"},
@ -781,8 +828,8 @@ spacy = [
{file = "spacy-3.0.6.tar.gz", hash = "sha256:5628ab89f1f568099c880b12a9c37f4ece29ab89260660cfdf728c02711879c5"},
]
spacy-legacy = [
{file = "spacy-legacy-3.0.6.tar.gz", hash = "sha256:76d102a840cae96dbcc2637e5baca0c0c001e6c43b5879112c6b4431eb0efbdb"},
{file = "spacy_legacy-3.0.6-py2.py3-none-any.whl", hash = "sha256:3029826f8cdd0da11331917a389d53d586f274837f35ed24b9263297a4574f50"},
{file = "spacy-legacy-3.0.7.tar.gz", hash = "sha256:c46a23d8eb8b8e95a3fd087cce6fb91d090f5c6bd2710159035f08a1fdd982e4"},
{file = "spacy_legacy-3.0.7-py2.py3-none-any.whl", hash = "sha256:e53fea9f11a67c1b6484062bef1a11484871de3132ffc77206f5e3e5ca9c92f4"},
]
srsly = [
{file = "srsly-2.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f7c3374184bfb1aa852bcb8e45747b02f2dde0ebe62b4ddf4b0141affeab32e1"},
@ -818,8 +865,8 @@ thinc = [
{file = "thinc-8.0.7.tar.gz", hash = "sha256:7c4382f22a5a3864b88cafe6e64f7a69f64b32147575b5f1e6f2211a8adb11f9"},
]
tqdm = [
{file = "tqdm-4.61.1-py2.py3-none-any.whl", hash = "sha256:aa0c29f03f298951ac6318f7c8ce584e48fa22ec26396e6411e43d038243bdb2"},
{file = "tqdm-4.61.1.tar.gz", hash = "sha256:24be966933e942be5f074c29755a95b315c69a91f839a29139bf26ffffe2d3fd"},
{file = "tqdm-4.61.2-py2.py3-none-any.whl", hash = "sha256:5aa445ea0ad8b16d82b15ab342de6b195a722d75fc1ef9934a46bba6feafbc64"},
{file = "tqdm-4.61.2.tar.gz", hash = "sha256:8bb94db0d4468fea27d004a0f1d1c02da3cdedc00fe491c0de986b76a04d6b0a"},
]
typer = [
{file = "typer-0.3.2-py3-none-any.whl", hash = "sha256:ba58b920ce851b12a2d790143009fa00ac1d05b3ff3257061ff69dbdfc3d161b"},

View file

@ -18,6 +18,8 @@ youtube_dl = "^2021.1.3"
setproctitle = "^1.2.1"
sentry-sdk = {extras = ["flask"], version = "^1.0.0"}
blinker = "^1.4" # tmp workaround to make flask-sentry work
Flask-Caching = "^1.10.1"
redis = "^3.5.3"
[tool.poetry.dev-dependencies]

View file

@ -6,7 +6,7 @@ from playhouse.postgres_ext import TS_MATCH
from playhouse.shortcuts import model_to_dict
from psycopg2._psycopg import cursor
from app import app
from app import app, db, cache
from models import *
@ -83,6 +83,7 @@ def api_search():
until = 1000
series = request.args.get('series')
exact = request.args.get('exact', False)
exact = False # don't allow exact searches
if not query or not until or not series:
return "no suggest query", 400
if len(query) > 50:
@ -110,7 +111,8 @@ def api_search():
d: Line
ri = 0
for d in results:
entry = model_to_dict(d, extra_attrs=[] if exact else ["rank"] , exclude=global_excludes + [Episode.subtitle_hash])
entry = model_to_dict(d, extra_attrs=[] if exact else ["rank"],
exclude=global_excludes + [Episode.subtitle_hash])
if not exact:
entry["rank"] = float(entry["rank"])
data.append({"centerID": d.id, "resultID": ri, "offset": 1, "lines": [entry]})
@ -145,6 +147,7 @@ def api_expand():
@app.route("/api/series")
@cache.cached(timeout=60 * 60 * 24)
def series():
series_list = []
@ -162,6 +165,7 @@ def series():
@app.route("/api/episodes")
@cache.cached(timeout=60 * 60 * 24)
def api_episodes():
all_series: List[Series] = Series.select().order_by(Series.id)
data = []

View file

@ -3,6 +3,7 @@ from datetime import timedelta
from pathlib import Path
from typing import Optional
from app import cache
from data import single_speaker
srtdir = Path("./data/subtitles/")
@ -59,3 +60,7 @@ def pretty_title(title: str) -> str:
return title.split("-")[0].strip()
else:
return title.strip()
def clear_cache() -> None:
    """Clear the application cache object imported from ``app``.

    Thin wrapper around ``cache.clear()`` so that scripts can reset the
    cache without importing the cache object themselves.
    """
    cache.clear()