mirror of
https://github.com/Findus23/cr-search.git
synced 2024-09-11 06:03:45 +02:00
add a lot of typos and a way to quickly find them
This commit is contained in:
parent
84ca5e017c
commit
6cc0f86f74
2 changed files with 32 additions and 6 deletions
16
typo.py
16
typo.py
|
@ -3,19 +3,23 @@ replace common typos of names to unify them in the database
|
|||
"""
|
||||
|
||||
# Maps each canonical speaker name to the set of variants that should be
# rewritten to it. Each key appears exactly once: a repeated key in a dict
# literal silently discards the earlier entry.
# NOTE(review): a few variants ("Nott", "Beau", "Vex", "Caleb", "Vax") are not
# misspellings of their key -- presumably alternative labels for the same
# speaker; confirm before pruning.
typos = {
    "Matt": {"Mat", "Mattt", "\"Matt", "Matr", "Mtt"},
    "Sam": {"San", "Nott", "Sma", "Sasm", "Sm", "Ssam"},
    "Travis": {"Tarvis", "Travs", "Travia", "Traivs", "Tavis", "Trvis"},
    "Taliesin": {"Taiesin", "Talisin", "Talisen", "Taleisn", "Talisein", "Talieisin", "Talesin", "Talisan", "Taleisin",
                 "Talieisn", "Talisien", "Tailesin"},
    "Marisha": {"Beau", "Mariasha", "Maisha", "Marisa", "Marish", "Marihsa", "Marsha", "Marsisha", "Marishaa",
                "Marihsha", "\\Marisha", "Marisah", "Marissa", "Marirsha", "Marisaha", "Mairsha", "Marshia", "Marsiha",
                "Marishia", "Marsiah", "Matisha", "Mraisha", "Amrisha", "<Arisha"},
    "Laura": {"Lauda", "Lauren", "Larua", "Laur", "Lauar", "Vex", "Laira"},
    "Liam": {"Caleb", "Laim", "Vax"},
    "Ashley": {"Ashly", "Ashely", "Ashey", "Aslhey", "Ahsley"},
    "All": {"Everyone", "Everybody"},
    "Mark": {"Marik"},
    "Brian": {"Brain"},
    "Joe": {"Jroe"},
    "Man Off-Camera": {"Man Off Camera"},
    "Off-Screen": {"Offscreen"},
}
|
||||
replacements = {}
|
||||
for correct, typoset in typos.items():
|
||||
|
|
22
typo_find.py
Normal file
22
typo_find.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
from difflib import SequenceMatcher
|
||||
from itertools import combinations
|
||||
|
||||
from models import Person
|
||||
|
||||
def collect_names(raw_names):
    """Return the set of individual speaker names found in *raw_names*.

    Each item of *raw_names* is a name string. Strings containing a newline
    are ignored. Strings containing a comma are treated as a list of names:
    they are split on the comma and every piece is stripped of surrounding
    whitespace. Empty pieces (e.g. from a trailing comma) are dropped so the
    empty string is never reported as a name. Strings without a comma are
    kept exactly as they are.
    """
    people = set()
    for raw in raw_names:
        if "\n" in raw:
            continue
        if "," in raw:
            pieces = (piece.strip() for piece in raw.split(","))
            people.update(piece for piece in pieces if piece)
        else:
            people.add(raw)
    return people


def similar_pairs(names, threshold=0.8):
    """Return ``(a, b, ratio)`` for every pair of *names* that looks like a typo.

    A pair is reported when its ``SequenceMatcher`` ratio is at least
    *threshold* but not exactly 1. The names are sorted first so the result
    order is the same on every run (plain set iteration order changes with
    string hash randomization).
    """
    hits = []
    for a, b in combinations(sorted(names), r=2):
        matcher = SequenceMatcher(None, a, b)
        # Both quick ratios are upper bounds on ratio(); if either is already
        # below the threshold the expensive exact computation can be skipped.
        if matcher.real_quick_ratio() < threshold or matcher.quick_ratio() < threshold:
            continue
        ratio = matcher.ratio()
        if ratio < threshold or ratio == 1:
            continue
        hits.append((a, b, ratio))
    return hits


def main():
    """Print all known names, then every suspiciously similar pair of names."""
    people = collect_names(p.name for p in Person.select())
    print(people)

    for a, b, ratio in similar_pairs(people):
        print(a, "|", b)
        print(ratio)


if __name__ == "__main__":
    main()
|
Loading…
Reference in a new issue