1
0
Fork 0
mirror of https://github.com/matomo-org/matomo-icons.git synced 2024-09-19 17:03:45 +02:00
matomo-icons/referrers.py

125 lines
4.7 KiB
Python
Raw Normal View History

2017-02-22 10:44:15 +01:00
#!/usr/bin/env python3
# Copyright (C) 2017 Lukas Winkler
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
import shutil
import sys
2016-12-23 15:19:57 +01:00
import urllib.parse
2016-12-27 18:33:44 +01:00
import os.path
import requests
import yaml
2016-12-23 15:19:57 +01:00
from bs4 import BeautifulSoup
2016-12-26 14:21:20 +01:00
def load_yaml(file):
    """Parse *file* as YAML and return the resulting object.

    Uses yaml.safe_load: the bare yaml.load(stream) call is deprecated
    (it needs an explicit Loader since PyYAML 5.1) and can execute
    arbitrary code on crafted input; the search-engine/social lists are
    plain data, so the safe loader is sufficient.
    """
    with open(file, 'r') as stream:
        return yaml.safe_load(stream)
2016-12-27 18:33:44 +01:00
def download_favicon(homepage_html, url, target_file):
    """
    Download the favicon of *url* to ``outputdir + target_file + ".ico"``.

    Detect favicon if linked via "<link rel="shortcut icon" href="/favicon.ico">",
    fall back to the first "<link rel="icon">", otherwise assume /favicon.ico.

    :param homepage_html: HTML of the site's homepage (bytes or str)
    :param url: bare hostname of the site (no scheme)
    :param target_file: output file name stem inside the global ``outputdir``
    :return: True if the icon was written, False otherwise
    """
    print(url, target_file)
    soup = BeautifulSoup(homepage_html, "html.parser")
    favicon_element = soup.find("link", rel="shortcut icon")
    if favicon_element and favicon_element.has_attr("href"):
        favicon_path = favicon_element['href']
    elif soup.find("link", rel="icon") and soup.find("link", rel="icon").has_attr("href"):
        # some sites don't use "shortcut icon" for favicon
        # in this case we take the first other icon and hope it fits
        favicon_path = soup.find("link", rel="icon")["href"]
    else:
        favicon_path = "/favicon.ico"
    print(favicon_path)
    # Works with relative and absolute favicon_paths:
    favicon_url = urllib.parse.urljoin("http://" + url, favicon_path)
    print(favicon_url)
    try:
        # timeout added: without it a stalled server hangs the whole run
        r = requests.get(favicon_url, stream=True, timeout=15)
        if r.status_code == 200:
            with open(outputdir + target_file + ".ico", 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            return True
    except requests.exceptions.RequestException:
        # broadened from (ConnectionError, InvalidSchema, TooManyRedirects)
        # so that Timeout and other transport errors are reported here
        # instead of propagating and aborting the whole crawl
        print("Error while downloading favicon")
    # non-200 responses and transport errors both count as failure
    return False
2016-12-23 16:07:44 +01:00
2016-12-26 14:21:20 +01:00
def main(search_engines):
    """Fetch one favicon per search engine / social network entry.

    :param search_engines: mapping loaded from the YAML list; in
        "searchengines" mode each value is a list whose first element has
        an "urls" key, otherwise the value itself is the URL list.
        Relies on the module globals MODE and outputdir.
    """
    for i, element in search_engines.items():
        if MODE == "searchengines":
            search_engine = element[0]
            urls = search_engine["urls"]
        else:
            urls = element
        first_url = None
        success = False
        for url in urls:
            # skip URL patterns ("{}") and paths — only bare hostnames
            # can be fetched and used as a file name
            if "{}" not in url and "/" not in url:
                if first_url is None:
                    first_url = url
            if first_url is None:
                # no usable hostname seen yet; previously this fell
                # through and crashed on `outputdir + first_url`
                continue
            print(url)
            if not os.path.isfile(outputdir + first_url + ".ico"):
                offline = False
                try:
                    r = requests.get("http://" + url, timeout=15)
                    r.raise_for_status()
                except requests.exceptions.ReadTimeout:
                    print("http://" + url + " " + "Timeout", file=sys.stderr)
                    offline = True
                except requests.exceptions.TooManyRedirects:
                    print("http://" + url + " " + "Too many Redirects", file=sys.stderr)
                    offline = True
                except (requests.exceptions.HTTPError,
                        requests.exceptions.ConnectionError) as e:
                    # TooManyRedirects was listed here too, but it is
                    # already caught above, so that entry was unreachable
                    print("http://" + url + " " + str(e), file=sys.stderr)
                    offline = True
                except requests.exceptions.RequestException as e:
                    print("http://" + url + " " + str(e.args[0].reason), file=sys.stderr)
                    offline = True

                if not offline:
                    success = download_favicon(r.content, url, first_url)
                if success:
                    break
            else:
                print("file already downloaded")
2016-12-26 14:21:20 +01:00
if __name__ == "__main__":
    # mode is the first CLI argument; empty string when omitted
    MODE = sys.argv[1] if len(sys.argv) >= 2 else ""
    if MODE == "searchengines":
        yamlfile = "vendor/piwik/searchengine-and-social-list/SearchEngines.yml"
        outputdir = "src/searchEngines/"
    elif MODE == "socials":
        yamlfile = "vendor/piwik/searchengine-and-social-list/Socials.yml"
        outputdir = "src/socials/"
    else:
        yamlfile = outputdir = False
        print('Invalid mode. Valid modes: "searchengines" or "socials"')
        sys.exit(1)
    # list of already-processed entries; missing file means a fresh run
    try:
        with open('finished.txt') as finished_file:
            finished = [line.rstrip('\n') for line in finished_file]
    except FileNotFoundError:
        finished = []

    main(load_yaml(yamlfile))