2021-07-13 22:21:26 +02:00
|
|
|
import random
|
2020-08-07 22:29:54 +02:00
|
|
|
from typing import List
|
2020-03-08 18:48:14 +01:00
|
|
|
|
2021-07-18 11:18:09 +02:00
|
|
|
from flask import request, jsonify, Response, abort
|
2021-07-16 11:50:56 +02:00
|
|
|
from peewee import fn, Alias, SQL, DoesNotExist, Expression, ModelSelect, JOIN
|
2020-08-07 18:09:46 +02:00
|
|
|
from playhouse.postgres_ext import TS_MATCH
|
2020-03-07 10:45:39 +01:00
|
|
|
from playhouse.shortcuts import model_to_dict
|
|
|
|
from psycopg2._psycopg import cursor
|
|
|
|
|
2021-07-07 17:27:21 +02:00
|
|
|
from app import app, db, cache
|
2020-03-07 10:45:39 +01:00
|
|
|
from models import *
|
2020-08-15 12:27:16 +02:00
|
|
|
# logger = logging.getLogger('peewee')
|
|
|
|
# logger.addHandler(logging.StreamHandler())
|
|
|
|
# logger.setLevel(logging.DEBUG)
|
2021-10-26 21:03:45 +02:00
|
|
|
from stats import TotalWords, MostCommonNounChunks, LongestNounChunks, LinesPerPerson
|
2021-07-13 22:21:26 +02:00
|
|
|
from suggestions import suggestions
|
2020-03-08 14:48:04 +01:00
|
|
|
|
|
|
|
|
2020-08-30 22:11:28 +02:00
|
|
|
def add_cors(response: Response) -> Response:
    """Attach a permissive CORS header to an outgoing response.

    Registered via app.after_request in the __main__ block below.
    """
    # NOTE(review): '*' allows any origin — confirm this is intended beyond dev.
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response
|
|
|
|
|
|
|
|
|
2021-07-04 22:24:51 +02:00
|
|
|
def suggest(query: str, until: int, series: str, limit: int = 10) -> ModelSelect:
    """Build a query for the most frequent phrases containing *query*.

    Restricted to the series identified by slug *series* and to episodes up to
    *until*; phrase counts are summed across episodes and sorted descending.
    """
    total = Alias(fn.SUM(Phrase.count), "total_count")
    condition = (
        (Episode.series.slug == series)
        & (Episode.episode_number <= until)
        & (Phrase.text.contains(query))
    )
    return (Phrase.select(Phrase.text, total)
            .join(Episode)
            .join(Series)
            .where(condition)
            .group_by(Phrase.text)
            .order_by(SQL("total_count DESC"))
            .limit(limit))
|
|
|
|
|
|
|
|
|
2021-07-04 22:24:51 +02:00
|
|
|
def search(query: str, until: int, series: str, limit: int = 50) -> ModelSelect:
    """Build a ranked full-text search query over lines.

    Uses Postgres websearch_to_tsquery against Line.search_text and orders by
    ts_rank_cd (normalization flags 1 + 4). The Person join is FULL so lines
    without a speaker are still returned.
    """
    ts_query = fn.websearch_to_tsquery('english', query)
    rank = Alias(fn.ts_rank_cd(Line.search_text, ts_query, 1 + 4), "rank")
    matches = Expression(Line.search_text, TS_MATCH, ts_query)
    in_scope = (Episode.episode_number <= until) & (Episode.series.slug == series)
    return (Line.select(Line, Person, Episode, Series, rank)
            .where(matches & in_scope)
            .order_by(SQL("rank DESC"))
            .join(Person, join_type=JOIN.FULL).switch(Line)
            .join(Episode).join(Series)
            .limit(limit))
|
|
|
|
|
|
|
|
|
2021-07-06 17:39:25 +02:00
|
|
|
def exact_search(query: str, until: int, series: str, limit: int = 50) -> ModelSelect:
    """Build a plain substring search over line text (no ranking).

    Results come back in episode/line order rather than by relevance.
    """
    condition = (
        (Episode.episode_number <= until)
        & (Episode.series.slug == series)
        & (Line.text.contains(query))
    )
    return (Line.select(Line, Person, Episode, Series)
            .where(condition)
            .order_by(Episode.video_number, Line.order)
            .join(Person).switch(Line)
            .join(Episode).join(Series)
            .limit(limit))
|
|
|
|
|
|
|
|
|
2020-08-08 15:18:14 +02:00
|
|
|
# Fields stripped from every model_to_dict serialization across the API:
# search_text backs full-text search (never sent to clients), the import
# flags and titles are internal bookkeeping.
global_excludes = [Line.search_text, Episode.phrases_imported, Episode.text_imported, Person.series, Episode.title]
|
2020-04-07 11:08:52 +02:00
|
|
|
|
|
|
|
|
2020-03-08 14:48:04 +01:00
|
|
|
@app.route("/api/suggest")
|
2020-08-15 12:27:16 +02:00
|
|
|
def api_question():
|
2020-03-08 14:48:04 +01:00
|
|
|
query: str = request.args.get('query')
|
2020-03-07 10:45:39 +01:00
|
|
|
until = request.args.get('until')
|
2021-07-06 17:39:25 +02:00
|
|
|
if until == "-":
|
|
|
|
until = 1000
|
2020-04-15 18:11:45 +02:00
|
|
|
series = request.args.get('series')
|
|
|
|
if not query or not until or not series:
|
2020-03-07 10:45:39 +01:00
|
|
|
return "no suggest query", 400
|
2021-07-15 22:24:56 +02:00
|
|
|
if len(query) > 500:
|
2020-03-08 18:48:14 +01:00
|
|
|
return "too long query", 400
|
2021-07-07 18:30:28 +02:00
|
|
|
cache_key = f"suggest_{until}_{series}_{query}"
|
|
|
|
if len(query) < 3:
|
|
|
|
result = cache.get(cache_key)
|
|
|
|
if result:
|
|
|
|
return jsonify(result)
|
2020-08-15 12:27:16 +02:00
|
|
|
phrases = suggest(query, until, series)
|
2021-07-07 18:30:28 +02:00
|
|
|
result = [p.text for p in phrases]
|
|
|
|
if len(query) < 3:
|
|
|
|
cache.set(cache_key, result, timeout=60 * 60 * 24 * 7)
|
|
|
|
return jsonify(result)
|
2020-03-07 10:45:39 +01:00
|
|
|
|
|
|
|
|
2020-03-08 14:48:04 +01:00
|
|
|
@app.route("/api/search")
|
2020-08-15 12:27:16 +02:00
|
|
|
def api_search():
|
2020-03-07 10:45:39 +01:00
|
|
|
query = request.args.get('query')
|
|
|
|
until = request.args.get('until')
|
2021-07-06 17:39:25 +02:00
|
|
|
if until == "-":
|
|
|
|
until = 1000
|
2020-04-15 18:11:45 +02:00
|
|
|
series = request.args.get('series')
|
2021-07-06 17:39:25 +02:00
|
|
|
exact = request.args.get('exact', False)
|
2021-07-07 17:27:21 +02:00
|
|
|
exact = False # don't allow exact searches
|
2020-04-15 18:11:45 +02:00
|
|
|
if not query or not until or not series:
|
2021-07-07 20:08:23 +02:00
|
|
|
return "no search query", 400
|
2021-07-15 22:24:56 +02:00
|
|
|
if len(query) > 500:
|
2020-03-08 18:48:14 +01:00
|
|
|
return "too long query", 400
|
2020-03-07 10:45:39 +01:00
|
|
|
|
2021-07-06 17:39:25 +02:00
|
|
|
if exact:
|
|
|
|
results = exact_search(query, until, series)
|
|
|
|
else:
|
|
|
|
results = search(query, until, series)
|
|
|
|
|
|
|
|
if len(results) == 0:
|
|
|
|
result: cursor = db.execute_sql("select websearch_to_tsquery('english',%s)", [query])
|
|
|
|
parsed = result.fetchone()[0]
|
|
|
|
if not parsed:
|
|
|
|
return jsonify({
|
|
|
|
"status": "warning",
|
|
|
|
"message": "Only stop words were used. Please try to add a less common word to the search."
|
|
|
|
})
|
|
|
|
else:
|
|
|
|
resp: Response = jsonify({"status": "warning", "message": f"No results were found for {parsed}"})
|
|
|
|
resp.status_code = 404
|
|
|
|
return resp
|
2020-03-07 10:45:39 +01:00
|
|
|
|
|
|
|
data = []
|
|
|
|
d: Line
|
|
|
|
ri = 0
|
|
|
|
for d in results:
|
2021-07-07 17:27:21 +02:00
|
|
|
entry = model_to_dict(d, extra_attrs=[] if exact else ["rank"],
|
|
|
|
exclude=global_excludes + [Episode.subtitle_hash])
|
2021-07-06 17:39:25 +02:00
|
|
|
if not exact:
|
|
|
|
entry["rank"] = float(entry["rank"])
|
2020-03-07 10:45:39 +01:00
|
|
|
data.append({"centerID": d.id, "resultID": ri, "offset": 1, "lines": [entry]})
|
|
|
|
ri += 1
|
|
|
|
|
|
|
|
return jsonify(data)
|
|
|
|
|
|
|
|
|
2020-03-08 14:48:04 +01:00
|
|
|
@app.route("/api/expand")
|
2020-08-15 12:27:16 +02:00
|
|
|
def api_expand():
|
2020-03-07 10:45:39 +01:00
|
|
|
center_id = request.args.get('centerID')
|
|
|
|
offset = int(request.args.get('offset', 1))
|
|
|
|
if not center_id:
|
|
|
|
return "no central line ID", 400
|
|
|
|
|
|
|
|
try:
|
|
|
|
center: Line = Line.select().where(Line.id == center_id).get()
|
|
|
|
|
|
|
|
except DoesNotExist:
|
|
|
|
return "not found", 404
|
|
|
|
|
|
|
|
lines = Line.select().where(
|
|
|
|
(Line.episode == center.episode) & (Line.order << [center.order - offset, center.order + offset])
|
|
|
|
)
|
|
|
|
l: Line
|
|
|
|
data = []
|
|
|
|
for l in lines:
|
2020-04-07 11:08:52 +02:00
|
|
|
entry = model_to_dict(l, exclude=global_excludes)
|
2020-03-07 10:45:39 +01:00
|
|
|
data.append(entry)
|
|
|
|
|
|
|
|
return jsonify(data)
|
|
|
|
|
|
|
|
|
2021-05-25 20:53:09 +02:00
|
|
|
@app.route("/api/series")
|
2021-07-07 17:27:21 +02:00
|
|
|
@cache.cached(timeout=60 * 60 * 24)
|
2021-05-25 20:53:09 +02:00
|
|
|
def series():
|
|
|
|
series_list = []
|
2021-10-26 15:30:14 +02:00
|
|
|
for series in Series.select().order_by(Series.order):
|
2021-07-04 22:24:51 +02:00
|
|
|
last_episode: Episode = Episode.select().where(Episode.series == series).order_by(
|
|
|
|
Episode.upload_date.desc()).limit(
|
|
|
|
1).get()
|
2021-10-26 15:30:14 +02:00
|
|
|
series_data = model_to_dict(series, exclude=[Series.order])
|
2021-07-04 22:24:51 +02:00
|
|
|
series_data["last_upload"] = last_episode.upload_date.strftime("%Y-%m-%d")
|
|
|
|
series_data["length"] = Episode.select().where(Episode.series == series).count()
|
|
|
|
series_list.append(series_data)
|
2021-05-25 20:53:09 +02:00
|
|
|
return jsonify({
|
|
|
|
"series": series_list
|
|
|
|
})
|
|
|
|
|
|
|
|
|
2020-08-07 22:29:54 +02:00
|
|
|
@app.route("/api/episodes")
|
2021-07-07 17:27:21 +02:00
|
|
|
@cache.cached(timeout=60 * 60 * 24)
|
2020-08-15 12:27:16 +02:00
|
|
|
def api_episodes():
|
2021-10-26 15:30:14 +02:00
|
|
|
all_series: List[Series] = Series.select().order_by(Series.order)
|
2020-08-07 22:29:54 +02:00
|
|
|
data = []
|
|
|
|
for series in all_series:
|
|
|
|
|
|
|
|
episodes: List[Episode] = Episode.select().where(Episode.series == series).order_by(Episode.video_number)
|
|
|
|
|
|
|
|
series_data = []
|
|
|
|
for episode in episodes:
|
2020-08-08 15:18:14 +02:00
|
|
|
entry = model_to_dict(episode, exclude=[Episode.series, Episode.title])
|
2021-07-04 22:24:51 +02:00
|
|
|
if entry["upload_date"]:
|
|
|
|
entry["upload_date"] = entry["upload_date"].strftime("%Y-%m-%d")
|
2020-08-07 22:29:54 +02:00
|
|
|
series_data.append(entry)
|
|
|
|
data.append({
|
|
|
|
"meta": model_to_dict(series),
|
|
|
|
"episodes": series_data
|
|
|
|
})
|
|
|
|
|
|
|
|
return jsonify(data)
|
|
|
|
|
|
|
|
|
2021-07-13 22:21:26 +02:00
|
|
|
@app.route("/api/suggestion")
|
|
|
|
def api_suggestion():
|
|
|
|
until = request.args.get('until')
|
|
|
|
series = request.args.get('series')
|
|
|
|
if series not in suggestions:
|
2021-07-18 11:18:09 +02:00
|
|
|
abort(404)
|
2021-07-13 22:21:26 +02:00
|
|
|
all_suggestions = suggestions[series]
|
|
|
|
if until == "-":
|
|
|
|
possible_suggestions = [s.text for s in all_suggestions]
|
|
|
|
else:
|
|
|
|
possible_suggestions = [s.text for s in all_suggestions if s.episode <= int(until)]
|
|
|
|
chosen_suggestion = random.choice(possible_suggestions)
|
|
|
|
return Response(chosen_suggestion, mimetype='text/plain')
|
|
|
|
|
|
|
|
|
2021-10-26 18:10:00 +02:00
|
|
|
@app.route("/api/transcript")
|
|
|
|
@cache.cached(timeout=60 * 60 * 24)
|
|
|
|
def transcript():
|
|
|
|
series = request.args.get('series')
|
|
|
|
episode_number = request.args.get('episode')
|
|
|
|
|
|
|
|
episode = Episode.select(Episode, Series).where(
|
|
|
|
(Episode.episode_number == episode_number)
|
|
|
|
&
|
|
|
|
(Episode.series.slug == series)
|
|
|
|
).join(Series).get()
|
|
|
|
|
|
|
|
lines: List[Line] = Line.select(Line, Person).where(
|
|
|
|
(Episode.episode_number == episode_number)
|
|
|
|
&
|
|
|
|
(Episode.series.slug == series)
|
|
|
|
).order_by(Line.order) \
|
|
|
|
.join(Person, join_type=JOIN.FULL).switch(Line) \
|
|
|
|
.join(Episode).join(Series)
|
|
|
|
|
|
|
|
line_data = []
|
|
|
|
for line in lines:
|
|
|
|
entry = model_to_dict(line, exclude=global_excludes + [Line.episode])
|
|
|
|
|
|
|
|
line_data.append(entry)
|
|
|
|
|
|
|
|
return jsonify({
|
|
|
|
"episode": model_to_dict(episode, exclude=global_excludes),
|
|
|
|
"lines": line_data
|
|
|
|
})
|
|
|
|
|
|
|
|
|
2021-10-26 21:03:45 +02:00
|
|
|
@app.route("/api/stats")
|
|
|
|
@cache.cached(timeout=60 * 60 * 24)
|
|
|
|
def stats():
|
|
|
|
return jsonify({
|
|
|
|
"TotalWords": TotalWords().as_data(),
|
|
|
|
"MostCommonNounChunks": MostCommonNounChunks().as_data(),
|
|
|
|
"LongestNounChunks": LongestNounChunks().as_data(),
|
|
|
|
"LinesPerPerson": LinesPerPerson().as_data()
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
@app.route("/api/stats/text")
|
|
|
|
@cache.cached(timeout=60 * 60 * 24)
|
|
|
|
def stats_text():
|
|
|
|
text = ""
|
|
|
|
|
|
|
|
for stats_class in [TotalWords, MostCommonNounChunks, LongestNounChunks, LinesPerPerson]:
|
|
|
|
text += type(stats_class()).__name__.center(100, "#") + "\n"
|
|
|
|
text += stats_class().as_plaintext() + "\n\n"
|
|
|
|
|
|
|
|
return Response(text, mimetype='text/plain')
|
|
|
|
|
|
|
|
|
2020-03-07 10:45:39 +01:00
|
|
|
if __name__ == "__main__":
|
2021-07-16 11:50:56 +02:00
|
|
|
import logging
|
2021-10-26 15:30:14 +02:00
|
|
|
|
2021-07-16 11:50:56 +02:00
|
|
|
logger = logging.getLogger('peewee')
|
|
|
|
logger.addHandler(logging.StreamHandler())
|
|
|
|
logger.setLevel(logging.DEBUG)
|
2020-03-07 10:45:39 +01:00
|
|
|
app.debug = True
|
|
|
|
app.after_request(add_cors)
|
|
|
|
app.run()
|