1
0
Fork 0
mirror of https://github.com/Findus23/cr-search.git synced 2024-09-19 15:23:44 +02:00

add a lot of typos and a way to quickly find them

This commit is contained in:
Lukas Winkler 2020-08-11 12:49:49 +02:00
parent 84ca5e017c
commit 6cc0f86f74
Signed by: lukas
GPG key ID: 54DE4D798D244853
2 changed files with 32 additions and 6 deletions

16
typo.py
View file

@@ -3,19 +3,23 @@ replace common typos of names to unify them in the database
""" """
# Maps each canonical speaker name to the set of variant spellings that
# should be replaced by it.
# NOTE(review): a few entries ("Beau", "Vex", "Vax", "Nott", "Caleb") look
# like aliases rather than misspellings -- presumably intentional; confirm.
typos = {
    "Matt": {"Mat", "Mattt", "\"Matt", "Matr", "Mtt"},
    "Sam": {"San", "Nott", "Sma", "Sasm", "Sm", "Ssam"},
    "Travis": {"Tarvis", "Travs", "Travia", "Traivs", "Tavis", "Trvis"},
    "Taliesin": {"Taiesin", "Talisin", "Talisen", "Taleisn", "Talisein", "Talieisin", "Talesin", "Talisan", "Taleisin",
                 "Talieisn", "Talisien", "Tailesin"},
    "Marisha": {"Beau", "Mariasha", "Maisha", "Marisa", "Marish", "Marihsa", "Marsha", "Marsisha", "Marishaa",
                "Marihsha", "\\Marisha", "Marisah", "Marissa", "Marirsha", "Marisaha", "Mairsha", "Marshia", "Marsiha",
                "Marishia", "Marsiah", "Matisha", "Mraisha", "Amrisha", "<Arisha"},
    "Laura": {"Lauda", "Lauren", "Larua", "Laur", "Lauar", "Vex", "Laira"},
    "Liam": {"Caleb", "Laim", "Vax"},
    "Ashley": {"Ashly", "Ashely", "Ashey", "Aslhey", "Ahsley"},
    "All": {"Everyone", "Everybody"},
    "Mark": {"Marik"},
    "Brian": {"Brain"},
    "Joe": {"Jroe"},
    "Man Off-Camera": {"Man Off Camera"},
    "Off-Screen": {"Offscreen"}
}
replacements = {} replacements = {}
for correct, typoset in typos.items(): for correct, typoset in typos.items():

22
typo_find.py Normal file
View file

@@ -0,0 +1,22 @@
from difflib import SequenceMatcher
from itertools import combinations
from models import Person
# Pairs whose similarity ratio is below this value are not reported.
SIMILARITY_THRESHOLD = 0.8


def collect_names(raw_names):
    """Return the set of individual names found in *raw_names*.

    Each entry is handled as follows:

    * empty/``None`` entries and entries containing a newline are skipped;
    * entries containing a comma are split on the comma, each part is
      stripped of surrounding whitespace, and empty parts are dropped;
    * any other entry is added unchanged.
    """
    people = set()
    for raw in raw_names:
        if not raw or "\n" in raw:
            continue
        if "," in raw:
            parts = (part.strip() for part in raw.split(","))
            people.update(part for part in parts if part)
        else:
            people.add(raw)
    return people


def find_similar_pairs(names, threshold=SIMILARITY_THRESHOLD):
    """Yield ``(a, b, ratio)`` for every pair of similar, non-identical names.

    A pair is reported when ``threshold <= ratio < 1`` where *ratio* is
    ``SequenceMatcher(None, a, b).ratio()``.

    The names are sorted before pairing: ``ratio()`` may depend on the
    order of its arguments, so pairing straight from an unordered set would
    make both the output order and the reported ratios vary between runs.
    """
    for a, b in combinations(sorted(names), r=2):
        matcher = SequenceMatcher(None, a, b)
        # Both quick variants are upper bounds on ratio(), so a pair that
        # fails them can be rejected without the expensive full comparison.
        if matcher.real_quick_ratio() < threshold:
            continue
        if matcher.quick_ratio() < threshold:
            continue
        ratio = matcher.ratio()
        if ratio < threshold or ratio == 1:
            continue
        yield a, b, ratio


def main():
    """Print all known names, then every pair that looks like a typo.

    Names are read from the ``name`` attribute of each row returned by
    ``Person.select()``.
    """
    people = collect_names(p.name for p in Person.select())
    print(people)
    for a, b, ratio in find_similar_pairs(people):
        print(a, "|", b)
        print(ratio)


if __name__ == "__main__":
    main()