2017-02-22 10:44:15 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# Copyright (C) 2017 Lukas Winkler
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify it under
|
|
|
|
# the terms of the GNU General Public License as published by the Free Software
|
|
|
|
# Foundation, either version 3 of the License, or (at your option) any later
|
|
|
|
# version.
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License along with
|
|
|
|
# this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2016-12-23 14:23:20 +01:00
|
|
|
import shutil
|
|
|
|
import sys
|
2016-12-23 15:19:57 +01:00
|
|
|
import urllib.parse
|
2016-12-27 18:33:44 +01:00
|
|
|
import os.path
|
2016-12-23 14:23:20 +01:00
|
|
|
import requests
|
|
|
|
import yaml
|
2016-12-23 15:19:57 +01:00
|
|
|
from bs4 import BeautifulSoup
|
2016-12-23 14:23:20 +01:00
|
|
|
|
|
|
|
|
2016-12-26 14:21:20 +01:00
|
|
|
def load_yaml(file):
    """Parse the YAML document at path *file* and return the resulting object.

    Uses yaml.safe_load: the vendor search-engine/social lists are plain data,
    and safe_load avoids the arbitrary-object-construction risk of a bare
    yaml.load (which is also deprecated without an explicit Loader since
    PyYAML 5.1).
    """
    with open(file, 'r') as stream:
        return yaml.safe_load(stream)
|
2016-12-23 14:23:20 +01:00
|
|
|
|
|
|
|
|
2016-12-27 18:33:44 +01:00
|
|
|
def download_favicon(homepage_html, url, target_file):
    """Find the favicon referenced by *homepage_html* and save it locally.

    Prefers an explicit <link rel="shortcut icon" href="..."> tag, falls back
    to the first <link rel="icon"> tag, and finally assumes /favicon.ico.
    Relative hrefs are resolved against "http://" + *url*.  The icon is
    written to outputdir + *target_file* + ".ico".

    Returns True when the favicon was downloaded successfully, False
    otherwise (non-200 response or network error).
    """
    print(url, target_file)
    soup = BeautifulSoup(homepage_html, "html.parser")
    favicon_element = soup.find("link", rel="shortcut icon")
    if favicon_element and favicon_element.has_attr("href"):
        favicon_path = favicon_element['href']
    else:
        # some sites don't use "shortcut icon" for favicon
        # in this case we take the first other icon and hope it fits
        icon_element = soup.find("link", rel="icon")
        if icon_element and icon_element.has_attr("href"):
            favicon_path = icon_element["href"]
        else:
            favicon_path = "/favicon.ico"
    print(favicon_path)
    # urljoin works with relative and absolute favicon_paths:
    favicon_url = urllib.parse.urljoin("http://" + url, favicon_path)
    print(favicon_url)
    try:
        # timeout so a dead server cannot hang the whole run
        # (matches the 15s timeout used for the homepage request in main)
        r = requests.get(favicon_url, stream=True, timeout=15)
        if r.status_code == 200:
            with open(outputdir + target_file + ".ico", 'wb') as f:
                # decode gzip/deflate transfer-encoding before copying raw bytes
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            return True
    except (
            requests.exceptions.ConnectionError,
            requests.exceptions.InvalidSchema,
            requests.exceptions.Timeout,
            requests.exceptions.TooManyRedirects):
        print("Error while downloading favicon")
    # explicit failure result instead of falling through with None
    return False
|
2016-12-23 16:07:44 +01:00
|
|
|
|
2016-12-26 14:21:20 +01:00
|
|
|
|
|
|
|
def main(search_engines):
    """Download one favicon per search-engine / social-network entry.

    For each entry, iterate its candidate URLs — skipping template URLs
    (containing "{}") and URLs with a path component — until one favicon
    downloads successfully.  The icon file is named after the first usable
    URL of the entry, and an entry is skipped entirely if that file already
    exists in outputdir.

    Relies on the module-level MODE and outputdir set in __main__.
    """
    # keys of the mapping are display names we don't need here
    for element in search_engines.values():
        if MODE == "searchengines":
            # SearchEngines.yml nests the definition one list level deeper
            search_engine = element[0]
            urls = search_engine["urls"]
        else:
            urls = element
        first_url = None  # basename for the .ico file: first plain host seen
        success = False
        for url in urls:
            # skip template urls ("{}") and urls that contain a path
            if "{}" not in url and "/" not in url:
                if first_url is None:
                    first_url = url
                print(url)
                if not os.path.isfile(outputdir + first_url + ".ico"):
                    try:
                        offline = False
                        r = requests.get("http://" + url, timeout=15)
                        # treat 4xx/5xx like a failed host
                        r.raise_for_status()
                    except requests.exceptions.ReadTimeout:
                        print("http://" + url + " " + "Timeout", file=sys.stderr)
                        offline = True
                    except requests.exceptions.TooManyRedirects:
                        print("http://" + url + " " + "Too many Redirects", file=sys.stderr)
                        offline = True
                    except (requests.exceptions.HTTPError,
                            requests.exceptions.ConnectionError) as e:
                        # TooManyRedirects is already handled above, so it is
                        # not repeated in this tuple
                        print("http://" + url + " " + str(e), file=sys.stderr)
                        offline = True
                    except requests.exceptions.RequestException as e:
                        print("http://" + url + " " + str(e.args[0].reason), file=sys.stderr)
                        offline = True

                    if not offline:
                        success = download_favicon(r.content, url, first_url)
                    if success:
                        break
                else:
                    print("file already downloaded")
|
2016-12-26 14:21:20 +01:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # mode is the first command line argument ("" when omitted)
    MODE = sys.argv[1] if len(sys.argv) >= 2 else ""

    if MODE == "searchengines":
        yamlfile = "vendor/piwik/searchengine-and-social-list/SearchEngines.yml"
        outputdir = "src/Referrers/images/searchEngines/"
    elif MODE == "socials":
        yamlfile = "vendor/piwik/searchengine-and-social-list/Socials.yml"
        outputdir = "src/Referrers/images/socials/"
    else:
        print('Invalid mode. Valid modes: "searchengines" or "socials"')
        # sys.exit instead of the interactive-shell helper exit();
        # no point assigning yamlfile/outputdir before terminating
        sys.exit(1)

    # hosts already handled in an earlier run, one per line;
    # "with" ensures the file handle is closed after reading
    try:
        with open('finished.txt') as finished_file:
            finished = [line.rstrip('\n') for line in finished_file]
    except FileNotFoundError:
        finished = []

    main(load_yaml(yamlfile))
|