From 4b3e829deed878b15024942190c849058c8f0045 Mon Sep 17 00:00:00 2001 From: Lukas Winkler Date: Wed, 7 Jul 2021 17:27:21 +0200 Subject: [PATCH] add redis caching and better database --- app.py | 9 ++++++- benchmark.py | 16 ++++++------ createdb.py | 3 ++- fetch.py | 4 +-- import.py | 6 +++-- models.py | 12 +++------ phrases.py | 5 +++- poetry.lock | 67 ++++++++++++++++++++++++++++++++++++++++++-------- pyproject.toml | 2 ++ server.py | 8 ++++-- utils.py | 5 ++++ 11 files changed, 103 insertions(+), 34 deletions(-) diff --git a/app.py b/app.py index d24cd40..de7674c 100644 --- a/app.py +++ b/app.py @@ -1,4 +1,5 @@ from flask import Flask +from flask_caching import Cache from playhouse.flask_utils import FlaskDB from playhouse.pool import PooledPostgresqlDatabase @@ -9,15 +10,21 @@ DATABASE = PooledPostgresqlDatabase(**config.dbauth) if config.sentryDSN: import sentry_sdk from sentry_sdk.integrations.flask import FlaskIntegration + sentry_sdk.init( dsn=config.sentryDSN, integrations=[FlaskIntegration()] ) +CACHE_TYPE = "RedisCache" + +if config.production: + CACHE_REDIS_URL = "unix:///run/redis-crsearch/redis-server.sock" + # Create a Flask WSGI app and configure it using values from the module. app = Flask(__name__) app.config.from_object(__name__) - +cache = Cache(app) flask_db = FlaskDB(app) db = flask_db.database diff --git a/benchmark.py b/benchmark.py index 5dbb42e..cf7c578 100644 --- a/benchmark.py +++ b/benchmark.py @@ -7,12 +7,12 @@ from alive_progress import alive_bar from peewee import SelectQuery from psycopg2._psycopg import cursor -from models import db -from server import search, suggest +from app import db +from server import search, suggest, exact_search def benchmark_query(query: SelectQuery, filename: str = None) -> Tuple[float, float]: - query, params = test_search.sql() + query, params = query.sql() query = "EXPLAIN (ANALYZE, COSTS, VERBOSE, BUFFERS, FORMAT JSON) " + query @@ -45,12 +45,14 @@ def statistics(query: SelectQuery, filename: str, repeats: int = 500) -> None: print(mean(execution_times), stdev(execution_times)) -test_search = search("hello", 1000, 1, 200) +test_search = search("hello", 1000, "campaign2", 200) statistics(test_search, filename="search_hello") -test_search = search("a very long search query with a lot of stop word", 1000, 1, 200) +test_search = exact_search("hello", 1000, "campaign2", 200) +statistics(test_search, filename="exact_search", repeats=50) +test_search = search("a very long search query with a lot of stop word", 1000, "campaign2", 200) statistics(test_search, filename="search_long") -test_search = suggest("gnoll", 1000, 1) +test_search = suggest("gnoll", 1000, "campaign2") statistics(test_search, filename="suggest_simple") -test_search = suggest("gu", 1000, 1) +test_search = suggest("gu", 1000, "campaign2") statistics(test_search, filename="suggest_two_letter", repeats=100) diff --git a/createdb.py b/createdb.py index 73c3da8..942a88d 100644 --- a/createdb.py +++ b/createdb.py @@ -1,6 +1,7 @@ from sys import argv -from models import db, Series, Phrase, Episode, Person, Line +from app import db +from models import Series, Phrase, Episode, Person, Line def confirm(message: str) -> None: diff --git a/fetch.py b/fetch.py index caccc78..a2b0c07 100644 --- a/fetch.py +++ b/fetch.py @@ -12,7 +12,7 @@ from peewee import DoesNotExist from data import series_data from models import Episode, Series, Line, Phrase -from utils import srtdir, pretty_title, title_to_episodenumber +from utils import srtdir, pretty_title, title_to_episodenumber, clear_cache static_path = Path("static") @@ -112,7 +112,7 @@ def main(args) -> None: except FileNotFoundError: e.downloaded = False e.save() - + clear_cache() if __name__ == '__main__': parser = argparse.ArgumentParser(description="fetch episode data from YouTube") diff --git a/import.py b/import.py index 02d8847..9ed59c3 100644 --- a/import.py +++ b/import.py @@ -7,9 +7,10 @@ from alive_progress import alive_bar from peewee import fn, chunked from srt import parse, Subtitle -from models import Person, Line, Episode, db, Series +from app import db +from models import Person, Line, Episode, Series from typo import fix_typo -from utils import td_to_milliseconds, srtdir, episode_speaker +from utils import td_to_milliseconds, srtdir, episode_speaker, clear_cache def is_invalid_name(name: str) -> bool: @@ -134,6 +135,7 @@ def main() -> None: episode.text_imported = True episode.save() + clear_cache() if __name__ == '__main__': diff --git a/models.py b/models.py index a5c2b05..8196a93 100644 --- a/models.py +++ b/models.py @@ -1,18 +1,14 @@ from datetime import datetime -from peewee import PostgresqlDatabase, Model, IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \ +from peewee import IntegerField, CharField, BooleanField, ForeignKeyField, DateTimeField, \ DateField from playhouse.postgres_ext import TSVectorField -from config import dbauth - -db = PostgresqlDatabase(**dbauth) -db.connect() +from app import flask_db -class BaseModel(Model): - class Meta: - database = db +class BaseModel(flask_db.Model): + ... class Series(BaseModel): diff --git a/phrases.py b/phrases.py index d8c56ba..0d1f5ad 100644 --- a/phrases.py +++ b/phrases.py @@ -9,8 +9,10 @@ from spacy.lang.en import Language from spacy.tokens.span import Span from spacy.tokens.token import Token -from models import Episode, Line, db, Phrase +from app import db +from models import Episode, Line, Phrase from stopwords import STOP_WORDS +from utils import clear_cache os.nice(15) @@ -81,3 +83,4 @@ for episode in Episode.select().where((Episode.phrases_imported == False) & (Epi episode.phrases_imported = True episode.save() + clear_cache() diff --git a/poetry.lock b/poetry.lock index 0f39002..6b963cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -57,6 +57,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "cymem" version = "2.0.5" @@ -79,6 +87,7 @@ spacy = ">=3.0.0,<3.1.0" [package.source] type = "url" url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0.tar.gz" + [[package]] name = "flask" version = "2.0.1" @@ -97,6 +106,17 @@ Werkzeug = ">=2.0" async = ["asgiref (>=3.2)"] dotenv = ["python-dotenv"] +[[package]] +name = "flask-caching" +version = "1.10.1" +description = "Adds caching support to your Flask application" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +Flask = "*" + [[package]] name = "gunicorn" version = "20.1.0" @@ -243,6 +263,17 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "redis" +version = "3.5.3" +description = "Python client for Redis key-value store" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +hiredis = ["hiredis (>=0.1.3)"] + [[package]] name = "requests" version = "2.25.1" @@ -263,7 +294,7 @@ socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] [[package]] name = "sentry-sdk" -version = "1.1.0" +version = "1.2.0" description = "Python client for Sentry (https://sentry.io)" category = "main" optional = false @@ -284,6 +315,7 @@ chalice = ["chalice (>=1.16.0)"] django = ["django (>=1.8)"] falcon = ["falcon (>=1.4)"] flask = ["flask (>=0.11)", "blinker (>=1.1)"] +httpx = ["httpx (>=0.16.0)"] pure_eval = ["pure-eval", "executing", "asttokens"] pyspark = ["pyspark (>=2.4.4)"] rq = ["rq (>=0.6)"] @@ -367,7 +399,7 @@ transformers = ["spacy-transformers (>=1.0.1,<1.1.0)"] [[package]] name = "spacy-legacy" -version = "3.0.6" +version = "3.0.7" description = "Legacy registered functions for spaCy backwards compatibility" category = "main" optional = false @@ -429,12 +461,15 @@ torch = ["torch (>=1.5.0)"] [[package]] name = "tqdm" -version = "4.61.1" +version = "4.61.2" description = "Fast, Extensible Progress Meter" category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [package.extras] dev = ["py-make (>=0.1.0)", "twine", "wheel"] notebook = ["ipywidgets (>=6)"] @@ -500,7 +535,7 @@ python-versions = "*" [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "86eee8fba06068f62881fc46114eb1861a7eb3e4130c7ac0f9ffc457645b7157" +content-hash = "08d267706eac1a2c5f03bac0f5a705d627399f2e75e64cbd5f98c1a34fa0a86c" [metadata.files] alive-progress = [ @@ -541,6 +576,10 @@ click = [ {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"}, {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"}, ] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] cymem = [ {file = "cymem-2.0.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d72d69f7a62a280199c3aa7bc550685c47d6d0689b2d299e6492253b86d2437"}, {file = "cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:8ea57e6923f40eb51012352161bb5707c14a5a5ce901ff72021e59df06221655"}, @@ -561,6 +600,10 @@ flask = [ {file = "Flask-2.0.1-py3-none-any.whl", hash = "sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"}, {file = "Flask-2.0.1.tar.gz", hash = "sha256:1c4c257b1892aec1398784c63791cbaa43062f1f7aeb555c4da961b20ee68f55"}, ] +flask-caching = [ + {file = "Flask-Caching-1.10.1.tar.gz", hash = "sha256:cf19b722fcebc2ba03e4ae7c55b532ed53f0cbf683ce36fafe5e881789a01c00"}, + {file = "Flask_Caching-1.10.1-py3-none-any.whl", hash = "sha256:bcda8acbc7508e31e50f63e9b1ab83185b446f6b6318bd9dd1d45626fba2e903"}, +] gunicorn = [ {file = "gunicorn-20.1.0.tar.gz", hash = "sha256:e0a968b5ba15f8a328fdfd7ab1fcb5af4470c28aaf7e55df02a99bc13138e6e8"}, ] @@ -730,13 +773,17 @@ pyparsing = [ {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, ] +redis = [ + {file = "redis-3.5.3-py2.py3-none-any.whl", hash = "sha256:432b788c4530cfe16d8d943a09d40ca6c16149727e4afe8c2c9d5580c59d9f24"}, + {file = "redis-3.5.3.tar.gz", hash = "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2"}, +] requests = [ {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, ] sentry-sdk = [ - {file = "sentry-sdk-1.1.0.tar.gz", hash = "sha256:c1227d38dca315ba35182373f129c3e2722e8ed999e52584e6aca7d287870739"}, - {file = "sentry_sdk-1.1.0-py2.py3-none-any.whl", hash = "sha256:c7d380a21281e15be3d9f67a3c4fbb4f800c481d88ff8d8931f39486dd7b4ada"}, + {file = "sentry-sdk-1.2.0.tar.gz", hash = "sha256:9907adbdd30a55b818914512cc143e6beae0bb3ba78b2649f4b079752eb0e424"}, + {file = "sentry_sdk-1.2.0-py2.py3-none-any.whl", hash = "sha256:593f6118cc6d3eba4786c3f802567c937bdb81b3c8e90436e8a29e84071c6936"}, ] setproctitle = [ {file = "setproctitle-1.2.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:9106bcbacae534b6f82955b176723f1b2ca6514518aab44dffec05a583f8dca8"}, @@ -781,8 +828,8 @@ spacy = [ {file = "spacy-3.0.6.tar.gz", hash = "sha256:5628ab89f1f568099c880b12a9c37f4ece29ab89260660cfdf728c02711879c5"}, ] spacy-legacy = [ - {file = "spacy-legacy-3.0.6.tar.gz", hash = "sha256:76d102a840cae96dbcc2637e5baca0c0c001e6c43b5879112c6b4431eb0efbdb"}, - {file = "spacy_legacy-3.0.6-py2.py3-none-any.whl", hash = "sha256:3029826f8cdd0da11331917a389d53d586f274837f35ed24b9263297a4574f50"}, + {file = "spacy-legacy-3.0.7.tar.gz", hash = "sha256:c46a23d8eb8b8e95a3fd087cce6fb91d090f5c6bd2710159035f08a1fdd982e4"}, + {file = "spacy_legacy-3.0.7-py2.py3-none-any.whl", hash = "sha256:e53fea9f11a67c1b6484062bef1a11484871de3132ffc77206f5e3e5ca9c92f4"}, ] srsly = [ {file = "srsly-2.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f7c3374184bfb1aa852bcb8e45747b02f2dde0ebe62b4ddf4b0141affeab32e1"}, @@ -818,8 +865,8 @@ thinc = [ {file = "thinc-8.0.7.tar.gz", hash = "sha256:7c4382f22a5a3864b88cafe6e64f7a69f64b32147575b5f1e6f2211a8adb11f9"}, ] tqdm = [ - {file = "tqdm-4.61.1-py2.py3-none-any.whl", hash = "sha256:aa0c29f03f298951ac6318f7c8ce584e48fa22ec26396e6411e43d038243bdb2"}, - {file = "tqdm-4.61.1.tar.gz", hash = "sha256:24be966933e942be5f074c29755a95b315c69a91f839a29139bf26ffffe2d3fd"}, + {file = "tqdm-4.61.2-py2.py3-none-any.whl", hash = "sha256:5aa445ea0ad8b16d82b15ab342de6b195a722d75fc1ef9934a46bba6feafbc64"}, + {file = "tqdm-4.61.2.tar.gz", hash = "sha256:8bb94db0d4468fea27d004a0f1d1c02da3cdedc00fe491c0de986b76a04d6b0a"}, ] typer = [ {file = "typer-0.3.2-py3-none-any.whl", hash = "sha256:ba58b920ce851b12a2d790143009fa00ac1d05b3ff3257061ff69dbdfc3d161b"}, diff --git a/pyproject.toml b/pyproject.toml index d1be82a..b3b70b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,8 @@ youtube_dl = "^2021.1.3" setproctitle = "^1.2.1" sentry-sdk = {extras = ["flask"], version = "^1.0.0"} blinker = "^1.4" # tmp workaround to make flask-sentry work +Flask-Caching = "^1.10.1" +redis = "^3.5.3" [tool.poetry.dev-dependencies] diff --git a/server.py b/server.py index 96ed0a2..78b98b0 100644 --- a/server.py +++ b/server.py @@ -6,7 +6,7 @@ from playhouse.postgres_ext import TS_MATCH from playhouse.shortcuts import model_to_dict from psycopg2._psycopg import cursor -from app import app +from app import app, db, cache from models import * @@ -83,6 +83,7 @@ def api_search(): until = 1000 series = request.args.get('series') exact = request.args.get('exact', False) + exact = False # don't allow exact searches if not query or not until or not series: return "no suggest query", 400 if len(query) > 50: @@ -110,7 +111,8 @@ def api_search(): d: Line ri = 0 for d in results: - entry = model_to_dict(d, extra_attrs=[] if exact else ["rank"] , exclude=global_excludes + [Episode.subtitle_hash]) + entry = model_to_dict(d, extra_attrs=[] if exact else ["rank"], + exclude=global_excludes + [Episode.subtitle_hash]) if not exact: entry["rank"] = float(entry["rank"]) data.append({"centerID": d.id, "resultID": ri, "offset": 1, "lines": [entry]}) @@ -145,6 +147,7 @@ def api_expand(): @app.route("/api/series") +@cache.cached(timeout=60 * 60 * 24) def series(): series_list = [] @@ -162,6 +165,7 @@ def series(): @app.route("/api/episodes") +@cache.cached(timeout=60 * 60 * 24) def api_episodes(): all_series: List[Series] = Series.select().order_by(Series.id) data = [] diff --git a/utils.py b/utils.py index 3ec279c..c91bb6a 100644 --- a/utils.py +++ b/utils.py @@ -3,6 +3,7 @@ from datetime import timedelta from pathlib import Path from typing import Optional +from app import cache from data import single_speaker srtdir = Path("./data/subtitles/") @@ -59,3 +60,7 @@ def pretty_title(title: str) -> str: return title.split("-")[0].strip() else: return title.strip() + + +def clear_cache() -> None: + cache.clear()