From 3c772b780580d4d6663a66b479af51d3c04fe071 Mon Sep 17 00:00:00 2001 From: Lukas Winkler Date: Sat, 18 Jul 2020 21:59:42 +0200 Subject: [PATCH] store wikipedia images separately --- acros/admin.py | 7 +- acros/migrations/0039_auto_20200718_1919.py | 74 +++++++++++ acros/migrations/0040_auto_20200718_1920.py | 24 ++++ acros/migrations/0041_auto_20200718_1937.py | 23 ++++ acros/models/WikipediaImage.py | 54 ++++++++ acros/models/WikipediaLink.py | 43 +++--- acros/models/__init__.py | 1 + acros/templates/acros/detail.html | 10 +- acros/utils/apis.py | 137 +++++++++++++++++--- acros/utils/html.py | 9 ++ 10 files changed, 331 insertions(+), 51 deletions(-) create mode 100644 acros/migrations/0039_auto_20200718_1919.py create mode 100644 acros/migrations/0040_auto_20200718_1920.py create mode 100644 acros/migrations/0041_auto_20200718_1937.py create mode 100644 acros/models/WikipediaImage.py create mode 100644 acros/utils/html.py diff --git a/acros/admin.py b/acros/admin.py index 6c79d83..87102c2 100644 --- a/acros/admin.py +++ b/acros/admin.py @@ -2,7 +2,7 @@ from django.contrib import admin # Register your models here. from simple_history.admin import SimpleHistoryAdmin -from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host +from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host, WikipediaImage class OwnInline(admin.TabularInline): @@ -50,8 +50,8 @@ class LinkAdmin(SimpleHistoryAdmin): class WikipediaAdmin(SimpleHistoryAdmin): - readonly_fields = ["thumbnail_height", "thumbnail_width"] - + # readonly_fields = ["thumbnail_height", "thumbnail_width"] + ... 
admin.site.register(WikipediaLink, WikipediaAdmin) admin.site.register(Weblink, LinkAdmin) @@ -59,6 +59,7 @@ admin.site.register(PaperReference, PaperAdmin) admin.site.register(Tag, TagAdmin) admin.site.register(Acronym, AcronymAdmin) admin.site.register(Host) +admin.site.register(WikipediaImage) admin.site.site_header="Acronomy Administration" admin.site.site_title="Acronomy Administration" diff --git a/acros/migrations/0039_auto_20200718_1919.py b/acros/migrations/0039_auto_20200718_1919.py new file mode 100644 index 0000000..1b30481 --- /dev/null +++ b/acros/migrations/0039_auto_20200718_1919.py @@ -0,0 +1,74 @@ +# Generated by Django 3.0.8 on 2020-07-18 19:19 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('acros', '0038_auto_20200718_1727'), + ] + + operations = [ + migrations.CreateModel( + name='WikipediaImage', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('filename', models.CharField(max_length=200)), + ('pageid', models.IntegerField()), + ('thumbnail', models.ImageField(blank=True, null=True, upload_to='wikipedia_thumbnails/')), + ('thumb_width', models.IntegerField(blank=True, editable=False, null=True)), + ('thumb_height', models.IntegerField(blank=True, editable=False, null=True)), + ('imageurl', models.URLField()), + ('caption', models.CharField(blank=True, max_length=1000, null=True)), + ('credit', models.TextField()), + ('artist', models.TextField()), + ('license_short_name', models.TextField()), + ('attribution', models.TextField()), + ('license_url', models.URLField()), + ('attribution_required', models.BooleanField()), + ('copyrighted', models.BooleanField()), + ('timestamp', models.DateTimeField(blank=True)), + ], + ), + migrations.RemoveField( + model_name='historicalwikipedialink', + name='thumbnail', + ), + migrations.RemoveField( + model_name='historicalwikipedialink', + name='thumbnail_caption', + ), + 
migrations.RemoveField( + model_name='historicalwikipedialink', + name='thumbnail_height', + ), + migrations.RemoveField( + model_name='historicalwikipedialink', + name='thumbnail_title', + ), + migrations.RemoveField( + model_name='historicalwikipedialink', + name='thumbnail_width', + ), + migrations.RemoveField( + model_name='wikipedialink', + name='thumbnail', + ), + migrations.RemoveField( + model_name='wikipedialink', + name='thumbnail_caption', + ), + migrations.RemoveField( + model_name='wikipedialink', + name='thumbnail_height', + ), + migrations.RemoveField( + model_name='wikipedialink', + name='thumbnail_title', + ), + migrations.RemoveField( + model_name='wikipedialink', + name='thumbnail_width', + ), + ] diff --git a/acros/migrations/0040_auto_20200718_1920.py b/acros/migrations/0040_auto_20200718_1920.py new file mode 100644 index 0000000..f9a7485 --- /dev/null +++ b/acros/migrations/0040_auto_20200718_1920.py @@ -0,0 +1,24 @@ +# Generated by Django 3.0.8 on 2020-07-18 19:20 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('acros', '0039_auto_20200718_1919'), + ] + + operations = [ + migrations.AddField( + model_name='historicalwikipedialink', + name='thumbnail', + field=models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='acros.WikipediaImage'), + ), + migrations.AddField( + model_name='wikipedialink', + name='thumbnail', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='wiki_articles', to='acros.WikipediaImage'), + ), + ] diff --git a/acros/migrations/0041_auto_20200718_1937.py b/acros/migrations/0041_auto_20200718_1937.py new file mode 100644 index 0000000..98f11bd --- /dev/null +++ b/acros/migrations/0041_auto_20200718_1937.py @@ -0,0 +1,23 @@ +# Generated by Django 3.0.8 on 2020-07-18 19:37 + +from django.db import 
migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('acros', '0040_auto_20200718_1920'), + ] + + operations = [ + migrations.AlterField( + model_name='wikipediaimage', + name='attribution', + field=models.TextField(blank=True, null=True), + ), + migrations.AlterField( + model_name='wikipediaimage', + name='license_url', + field=models.URLField(blank=True, null=True), + ), + ] diff --git a/acros/models/WikipediaImage.py b/acros/models/WikipediaImage.py new file mode 100644 index 0000000..e4a46db --- /dev/null +++ b/acros/models/WikipediaImage.py @@ -0,0 +1,54 @@ +from tempfile import TemporaryFile + +import requests +from django.core.files import File +from django.db import models + +from acros.utils.apis import WikipediaImageAPIObject + + +class WikipediaImage(models.Model): + filename = models.CharField(max_length=200) + pageid = models.IntegerField() + thumbnail = models.ImageField(upload_to="wikipedia_images/", blank=True, null=True) + thumb_width = models.IntegerField(blank=True, editable=False, null=True) + thumb_height = models.IntegerField(blank=True, editable=False, null=True) + imageurl = models.URLField() + caption = models.CharField(max_length=1000, null=True, blank=True) + credit = models.TextField() + artist = models.TextField() + license_short_name = models.TextField() + attribution = models.TextField(null=True, blank=True) + license_url = models.URLField(null=True, blank=True) + attribution_required = models.BooleanField() + copyrighted = models.BooleanField() + timestamp = models.DateTimeField(blank=True) + + def save(self, *args, **kwargs): + img = WikipediaImageAPIObject(self.filename) + with TemporaryFile("rb+") as fd: + r = requests.get(img.thumburl) + for chunk in r.iter_content(chunk_size=128): + fd.write(chunk) + image_file = File(fd) + self.thumbnail.save(self.filename, image_file, save=False) + self.thumb_width, self.thumb_height = img.thumb_size + self.pageid = img.pageid + self.imageurl = img.url + 
self.credit = img.credit + self.artist = img.artist + self.license_short_name = img.license_short_name + self.attribution = img.attribution + self.license_url = img.license_url + self.attribution_required = img.attribution_required + self.copyrighted = img.copyrighted + self.timestamp = img.timestamp + + super(WikipediaImage, self).save(*args, **kwargs) + + @property + def commons_url(self): + return f"https://commons.wikimedia.org/wiki/File:{self.filename}" + + def __str__(self): + return self.filename diff --git a/acros/models/WikipediaLink.py b/acros/models/WikipediaLink.py index c2c570d..dc2d772 100644 --- a/acros/models/WikipediaLink.py +++ b/acros/models/WikipediaLink.py @@ -1,12 +1,8 @@ -from tempfile import TemporaryFile - -import requests -from django.core.files import File from django.db import models from simple_history.models import HistoricalRecords -from acros.models import Acronym -from acros.utils.apis import fetch_wikipedia_summary +from acros.models import Acronym, WikipediaImage +from acros.utils.apis import WikipediaAPISummary class WikipediaLink(models.Model): @@ -14,28 +10,27 @@ class WikipediaLink(models.Model): title = models.CharField(max_length=200) extract = models.TextField(blank=True) extract_html = models.TextField(blank=True) - thumbnail = models.ImageField(upload_to="wikipedia_thumbnails/", blank=True, null=True, - height_field="thumbnail_height", width_field="thumbnail_width") - thumbnail_width = models.IntegerField(blank=True, editable=False, null=True) - thumbnail_height = models.IntegerField(blank=True, editable=False, null=True) - thumbnail_title = models.CharField(max_length=100, null=True, blank=True) - thumbnail_caption = models.CharField(max_length=1000, null=True, blank=True) + thumbnail = models.ForeignKey(WikipediaImage, on_delete=models.CASCADE, related_name="wiki_articles", + blank=True, null=True) timestamp = models.DateTimeField(blank=True) fetched = models.BooleanField(default=False) history = HistoricalRecords() def 
save(self, *args, **kwargs): if not self.fetched: - self.extract, self.extract_html, self.timestamp, thumbnail_url, \ - self.thumbnail_title, self.thumbnail_caption = fetch_wikipedia_summary(self.title) - if thumbnail_url: - with TemporaryFile("rb+") as fd: - r = requests.get(thumbnail_url) - filename = thumbnail_url.split("/")[-1] - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - image_file = File(fd) - self.thumbnail.save(filename, image_file, save=False) + summary = WikipediaAPISummary(self.title) + self.extract = summary.extract + self.extract_html = summary.extract_html + self.timestamp = summary.timestamp + self.title = summary.title + if summary.image: + filename = summary.image.split("/")[-1] + try: + thumbnail = WikipediaImage.objects.get(filename=filename) + except WikipediaImage.DoesNotExist: + thumbnail = WikipediaImage.objects.create(filename=filename) + thumbnail.save() + self.thumbnail = thumbnail self.fetched = True super(WikipediaLink, self).save(*args, **kwargs) @@ -44,9 +39,5 @@ class WikipediaLink(models.Model): def url(self): return f"https://en.wikipedia.org/wiki/{self.title}" - @property - def thumbnail_wiki_url(self): - return f"https://en.wikipedia.org/wiki/{self.thumbnail_title}" - def __str__(self): return self.title diff --git a/acros/models/__init__.py b/acros/models/__init__.py index a524757..b451e62 100644 --- a/acros/models/__init__.py +++ b/acros/models/__init__.py @@ -4,4 +4,5 @@ from .AcroOfTheDay import AcroOfTheDay from .Host import Host from .PaperReference import PaperReference from .Weblink import Weblink +from .WikipediaImage import WikipediaImage from .WikipediaLink import WikipediaLink diff --git a/acros/templates/acros/detail.html b/acros/templates/acros/detail.html index f6e6af2..ed48b4c 100644 --- a/acros/templates/acros/detail.html +++ b/acros/templates/acros/detail.html @@ -62,9 +62,8 @@
{% if link.thumbnail %} - + {% endif %}
@@ -81,7 +80,10 @@ title="This snippet and thumbnail is from the English Wikipedia and licensed under the CC-BY-SA 3.0 license"> License - + Image-Source
diff --git a/acros/utils/apis.py b/acros/utils/apis.py index 3597245..71e86f3 100644 --- a/acros/utils/apis.py +++ b/acros/utils/apis.py @@ -1,26 +1,40 @@ +from typing import Tuple, Optional + import requests from bs4 import BeautifulSoup +from acros.utils.html import clean_html, string_to_bool -def fetch_wikipedia_summary(title: str): - r = requests.get("https://en.wikipedia.org/api/rest_v1/page/summary/" + title) - r.raise_for_status() - data = r.json() - # print(data) - r2 = requests.get("https://en.wikipedia.org/api/rest_v1/page/media-list/" + title) - r2.raise_for_status() - image_data = r2.json()["items"] - if len(image_data) > 0: - image_title = image_data[0]["title"] - image_caption = image_data[0]["caption"]["text"] if "caption" in image_data else None - else: - image_title = image_caption = None - return ( - data["extract"], data["extract_html"], data["timestamp"], - data["thumbnail"]["source"] if "thumbnail" in data else None, - image_title, image_caption - ) +class WikipediaAPISummary: + urlbase = "https://en.wikipedia.org/api/rest_v1/page/summary/" + + def __init__(self, title: str): + r = requests.get(self.urlbase + title) + r.raise_for_status() + self.data = r.json() + + @property + def title(self) -> str: + return self.data["title"] + + @property + def extract(self) -> str: + return self.data["extract"] + + @property + def extract_html(self) -> str: + return self.data["extract_html"] + + @property + def timestamp(self) -> str: + return self.data["timestamp"] + + @property + def image(self) -> Optional[str]: + if "originalimage" in self.data: + return self.data["originalimage"]["source"] + return None def get_website_title(url: str) -> str: @@ -29,3 +43,90 @@ def get_website_title(url: str) -> str: soup = BeautifulSoup(r.text, features="html.parser") title = soup.find("title") return title.text + + +class WikipediaImageAPIObject: + def __init__(self, filename: str): + self.filename = filename + print(self.api_url) + r = requests.get(self.api_url) + 
r.raise_for_status() + self.data = r.json() + self.image_obj = list(self.data["query"]["pages"].values())[0] + + @classmethod + def from_url(cls, url: str): + return cls(url.split("/")[-1]) + + @property + def api_url(self): + return "https://commons.wikimedia.org/w/api.php" \ + "?action=query" \ + "&format=json" \ + f"&titles=File:{self.filename}" \ + "&prop=imageinfo" \ + "&iiprop=extmetadata|size|url|timestamp" \ + "&iiurlwidth=500" + + @property + def pageid(self) -> int: + return self.image_obj["pageid"] + + @property + def imageinfo(self): + return self.image_obj["imageinfo"][0] + + @property + def timestamp(self) -> str: + return self.imageinfo["timestamp"] + + @property + def thumb_size(self) -> Tuple[int, int]: + return self.imageinfo["thumbwidth"], self.imageinfo["thumbheight"] + + @property + def url(self) -> str: + return self.imageinfo["url"] + + @property + def thumburl(self) -> str: + return self.imageinfo["thumburl"] + + @property + def extmetadata(self): + return self.imageinfo["extmetadata"] + + @property + def image_description(self) -> str: + return clean_html(self.extmetadata["ImageDescription"]["value"]) + + @property + def credit(self) -> str: + print(self.extmetadata["Credit"]) + return clean_html(self.extmetadata["Credit"]["value"]) + + @property + def artist(self) -> str: + return clean_html(self.extmetadata["Artist"]["value"]) + + @property + def license_short_name(self) -> str: + return self.extmetadata["LicenseShortName"]["value"] + + @property + def license_url(self) -> Optional[str]: + if "LicenseUrl" in self.extmetadata: + return self.extmetadata["LicenseUrl"]["value"] + + @property + def attribution_required(self) -> bool: + return string_to_bool(self.extmetadata["AttributionRequired"]["value"]) + + @property + def copyrighted(self) -> bool: + return string_to_bool(self.extmetadata["Copyrighted"]["value"]) + + @property + def attribution(self) -> Optional[str]: + if "Attribution" in self.extmetadata: + return 
self.extmetadata["Attribution"]["value"] diff --git a/acros/utils/html.py b/acros/utils/html.py new file mode 100644 index 0000000..2c57df8 --- /dev/null +++ b/acros/utils/html.py @@ -0,0 +1,9 @@ +from bs4 import BeautifulSoup + + +def clean_html(html: str) -> str: + return BeautifulSoup(html, "html.parser").text + + +def string_to_bool(string: str) -> bool: + return string.lower() in ["true"]