1
0
Fork 0
mirror of https://github.com/matomo-org/matomo-icons.git synced 2024-09-19 17:03:45 +02:00
matomo-icons/referrers.py

125 lines
4.7 KiB
Python
Raw Normal View History

2017-02-22 10:44:15 +01:00
#!/usr/bin/env python3
# Copyright (C) 2017 Lukas Winkler
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
import shutil
import sys
2016-12-23 15:19:57 +01:00
import urllib.parse
2016-12-27 18:33:44 +01:00
import os.path
import requests
import yaml
2016-12-23 15:19:57 +01:00
from bs4 import BeautifulSoup
2016-12-26 14:21:20 +01:00
def load_yaml(file):
    """Parse *file* as YAML and return the resulting object.

    Uses yaml.safe_load: the bare yaml.load(stream) call is deprecated
    (it needs an explicit Loader since PyYAML 5.1) and can execute
    arbitrary code on crafted input; the search-engine/social lists are
    plain data, so the safe loader is sufficient.
    """
    with open(file, 'r') as stream:
        return yaml.safe_load(stream)
2016-12-27 18:33:44 +01:00
def download_favicon(homepage_html, url, target_file):
    """
    Download the favicon of *url* to ``outputdir + target_file + ".ico"``.

    Detect favicon if linked via "<link rel="shortcut icon" href="/favicon.ico">",
    fall back to the first "<link rel="icon">", otherwise assume /favicon.ico.

    :param homepage_html: HTML of the site's homepage (bytes or str)
    :param url: bare hostname of the site (no scheme)
    :param target_file: output file name stem inside the global ``outputdir``
    :return: True if the icon was written, False otherwise
    """
    print(url, target_file)
    soup = BeautifulSoup(homepage_html, "html.parser")
    favicon_element = soup.find("link", rel="shortcut icon")
    if favicon_element and favicon_element.has_attr("href"):
        favicon_path = favicon_element['href']
    elif soup.find("link", rel="icon") and soup.find("link", rel="icon").has_attr("href"):
        # some sites don't use "shortcut icon" for favicon
        # in this case we take the first other icon and hope it fits
        favicon_path = soup.find("link", rel="icon")["href"]
    else:
        favicon_path = "/favicon.ico"
    print(favicon_path)
    # Works with relative and absolute favicon_paths:
    favicon_url = urllib.parse.urljoin("http://" + url, favicon_path)
    print(favicon_url)
    try:
        # timeout added: without it a stalled server hangs the whole run
        r = requests.get(favicon_url, stream=True, timeout=15)
        if r.status_code == 200:
            with open(outputdir + target_file + ".ico", 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            return True
    except requests.exceptions.RequestException:
        # broadened from (ConnectionError, InvalidSchema, TooManyRedirects)
        # so that Timeout and other transport errors are reported here
        # instead of propagating and aborting the whole crawl
        print("Error while downloading favicon")
    # non-200 responses and transport errors both count as failure
    return False
2016-12-23 16:07:44 +01:00
2016-12-26 14:21:20 +01:00
def main(search_engines):
    """Fetch one favicon per search engine / social network entry.

    :param search_engines: mapping loaded from the YAML list; in
        "searchengines" mode each value is a list whose first element has
        an "urls" key, otherwise the value itself is the URL list.
        Relies on the module globals MODE and outputdir.
    """
    for i, element in search_engines.items():
        if MODE == "searchengines":
            search_engine = element[0]
            urls = search_engine["urls"]
        else:
            urls = element
        first_url = None
        success = False
        for url in urls:
            # skip URL patterns ("{}") and paths — only bare hostnames
            # can be fetched and used as a file name
            if "{}" not in url and "/" not in url:
                if first_url is None:
                    first_url = url
            if first_url is None:
                # no usable hostname seen yet; previously this fell
                # through and crashed on `outputdir + first_url`
                continue
            print(url)
            if not os.path.isfile(outputdir + first_url + ".ico"):
                offline = False
                try:
                    r = requests.get("http://" + url, timeout=15)
                    r.raise_for_status()
                except requests.exceptions.ReadTimeout:
                    print("http://" + url + " " + "Timeout", file=sys.stderr)
                    offline = True
                except requests.exceptions.TooManyRedirects:
                    print("http://" + url + " " + "Too many Redirects", file=sys.stderr)
                    offline = True
                except (requests.exceptions.HTTPError,
                        requests.exceptions.ConnectionError) as e:
                    # TooManyRedirects was listed here too, but it is
                    # already caught above, so that entry was unreachable
                    print("http://" + url + " " + str(e), file=sys.stderr)
                    offline = True
                except requests.exceptions.RequestException as e:
                    print("http://" + url + " " + str(e.args[0].reason), file=sys.stderr)
                    offline = True

                if not offline:
                    success = download_favicon(r.content, url, first_url)
                if success:
                    break
            else:
                print("file already downloaded")
2016-12-26 14:21:20 +01:00
if __name__ == "__main__":
    # mode is the first CLI argument; empty string when omitted
    MODE = sys.argv[1] if len(sys.argv) >= 2 else ""
    if MODE == "searchengines":
        yamlfile = "vendor/piwik/searchengine-and-social-list/SearchEngines.yml"
        outputdir = "src/searchEngines/"
    elif MODE == "socials":
        yamlfile = "vendor/piwik/searchengine-and-social-list/Socials.yml"
        outputdir = "src/socials/"
    else:
        yamlfile = outputdir = False
        print('Invalid mode. Valid modes: "searchengines" or "socials"')
        sys.exit(1)
    # list of already-processed entries; missing file means a fresh run
    try:
        with open('finished.txt') as finished_file:
            finished = [line.rstrip('\n') for line in finished_file]
    except FileNotFoundError:
        finished = []

    main(load_yaml(yamlfile))