1
0
Fork 0
mirror of https://github.com/Findus23/pyLanguagetool.git synced 2024-09-09 04:13:46 +02:00

Add private wordlist features and spellcheck docs

This commit is contained in:
Johannes Hoppe 2019-05-16 14:58:25 +02:00
parent 93ed83e138
commit fe9bd06522
7 changed files with 58 additions and 11 deletions

View file

@ -14,7 +14,9 @@ tests:
docs:
stage: docs
script: python setup.py build_sphinx -W
script:
- python setup.py build_sphinx -W
- pylanguagetool --pwl spelling.txt --disabled-rules=WHITESPACE_RULE README.rst
check:
stage: check
@ -31,4 +33,5 @@ bdist:
stages:
- check
- tests
- docs
- build

View file

@ -18,3 +18,4 @@ script:
- python -m pytest
- python setup.py check -ms
- python setup.py build_sphinx -W
- pylanguagetool --pwl spelling.txt --disabled-rules=WHITESPACE_RULE README.rst

View file

@ -6,7 +6,7 @@ pyLanguagetool
A python library and CLI for the LanguageTool_ `JSON API`_.
LanguageTool_ is an open source spellchecking platform. It supports a large
variety of
variety of languages and has advanced grammar support.
Installation
------------
@ -48,9 +48,10 @@ replacements.
Configuration
-------------
All `Languagetool API`_ parameters can be set via commandline arguments,
environment variables or a configuration file (~/.config/pyLanguagetool.conf)
For more information about the configuration file syntax, read the `ConfigArgParse documentation`_
All `LanguageTool API`_ parameters can be set via command line arguments,
environment variables or a configuration file
(``~/.config/pyLanguagetool.conf``). For more information about the
configuration file syntax, read the `ConfigArgParse documentation`_.
Parameters
----------
@ -64,7 +65,7 @@ Parameters
[-e ENABLED_RULES] [-d DISABLED_RULES]
[--enabled-categories ENABLED_CATEGORIES]
[--disabled-categories DISABLED_CATEGORIES]
[--enabled-only]
[--enabled-only] [--pwl PWL]
[input file]
Args that start with '--' (eg. -v) can also be set in a config file
@ -121,6 +122,13 @@ Parameters
--enabled-only enable only the rules and categories whose IDs are
specified with --enabled-rules or --enabled-categories
--pwl PWL, --personal-word-list PWL
File name of personal dictionary. A private dictionary
can be used to add special words that would otherwise
be marked as spelling errors. [env var:
PERSONAL_WORD_LIST]
.. |license| image:: https://img.shields.io/badge/license-MIT-blue.svg
:target: https://raw.githubusercontent.com/Findus23/pyLanguagetool/master/LICENSE
@ -135,14 +143,15 @@ Parameters
Privacy
-------
By default pyLangugagetool sends all text via HTTPS to the languagetool.org server (see their `privacy policy`_).
You can also `setup your own server`_ and use it by changing --api-url.
By default pyLanguagetool sends all text via HTTPS to the `LanguageTool`_
server (see their `privacy policy`_). You can also `setup your own server`_ and
use it by changing the ``--api-url`` attribute.
.. _LanguageTool: https://languagetool.org/
.. _JSON API: https://languagetool.org/http-api/swagger-ui/#/default
.. _Languagetool API: https://languagetool.org/http-api/swagger-ui/#/default
.. _LanguageTool API: https://languagetool.org/http-api/swagger-ui/#/default
.. _ConfigArgParse documentation: https://github.com/bw2/ConfigArgParse#config-file-syntax

View file

@ -49,6 +49,13 @@ def init_config():
p.add_argument("--enabled-only", action='store_true', default=False,
help="enable only the rules and categories whose IDs are specified with --enabled-rules or --enabled-categories"
)
p.add_argument(
'--pwl', '--personal-word-list',
env_var='PERSONAL_WORD_LIST', help=(
'File name of personal dictionary. A private dictionary'
' can be used to add special words that would otherwise'
' be marked as spelling errors.'
))
c = vars(p.parse_args())
if c["enabled_only"] and (c["disabled_categories"] or c["disabled_rules"]):
@ -166,6 +173,10 @@ def main():
if config["verbose"]:
print(sys.version)
if config['pwl']:
with open(config['pwl'], 'r') as fs:
config['pwl'] = [w.strip() for w in fs.readlines()]
input_text, inputtype = get_input_text(config)
if not inputtype:
inputtype = config["input_type"]

View file

@ -41,10 +41,18 @@ def get_languages(api_url):
return r.json()
def _is_in_pwl(match, pwl):
start = match['context']['offset']
end = start + match['context']['length']
word = match['context']['text'][start:end]
return word in pwl
def check(input_text, api_url, lang, mother_tongue=None, preferred_variants=None,
enabled_rules=None, disabled_rules=None,
enabled_categories=None, disabled_categories=None,
enabled_only=False, verbose=False,
pwl=None,
**kwargs):
"""
Check given text and return API response as a dictionary.
@ -96,6 +104,10 @@ def check(input_text, api_url, lang, mother_tongue=None, preferred_variants=None
If ``True``, a more verbose output will be printed. Defaults to
``False``.
pwl (List[str]):
Personal word list. A custom dictionary of words that should be
excluded from spell checking errors.
Returns:
dict:
A dictionary representation of the JSON API response.
@ -157,10 +169,18 @@ def check(input_text, api_url, lang, mother_tongue=None, preferred_variants=None
post_parameters["disabledCategories"] = disabled_categories
if enabled_only:
post_parameters["enabledOnly"] = True
r = requests.post(api_url + "check", data=post_parameters)
if r.status_code != 200:
raise ValueError(r.text)
if verbose:
print(post_parameters)
print(r.json())
return r.json()
data = r.json()
if pwl:
matches = data.pop('matches', [])
data['matches'] = [
match for match in matches
if not _is_in_pwl(match, pwl)
]
return data

View file

@ -29,7 +29,7 @@ def html2text(html):
sys.exit(4)
soup = BeautifulSoup(html, "html.parser")
# remove scripts from html
for script in soup(["script", "style", "code", "pre"]):
for script in soup(["script", "style", "code", "pre"]) + soup("span", {"class": "literal"}):
script.extract()
return soup.get_text()

3
spelling.txt Normal file
View file

@ -0,0 +1,3 @@
pyLanguagetool
pipenv
ConfigArgParse