1
0
Fork 0
mirror of https://github.com/Findus23/acronomy.git synced 2024-09-19 15:33:45 +02:00

store wikipedia images separately

This commit is contained in:
Lukas Winkler 2020-07-18 21:59:42 +02:00
parent ebdf847032
commit 3c772b7805
Signed by: lukas
GPG key ID: 54DE4D798D244853
10 changed files with 331 additions and 51 deletions

View file

@ -2,7 +2,7 @@ from django.contrib import admin
# Register your models here.
from simple_history.admin import SimpleHistoryAdmin
from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host
from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host, WikipediaImage
class OwnInline(admin.TabularInline):
@ -50,8 +50,8 @@ class LinkAdmin(SimpleHistoryAdmin):
class WikipediaAdmin(SimpleHistoryAdmin):
readonly_fields = ["thumbnail_height", "thumbnail_width"]
# readonly_fields = ["thumbnail_height", "thumbnail_width"]
...
admin.site.register(WikipediaLink, WikipediaAdmin)
admin.site.register(Weblink, LinkAdmin)
@ -59,6 +59,7 @@ admin.site.register(PaperReference, PaperAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(Acronym, AcronymAdmin)
admin.site.register(Host)
admin.site.register(WikipediaImage)
admin.site.site_header="Acronomy Administration"
admin.site.site_title="Acronomy Administration"

View file

@ -0,0 +1,74 @@
# Generated by Django 3.0.8 on 2020-07-18 19:19
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create ``WikipediaImage`` and drop the inline thumbnail columns.

    First adds the ``WikipediaImage`` model, then removes the ``thumbnail``,
    ``thumbnail_caption``, ``thumbnail_height``, ``thumbnail_title`` and
    ``thumbnail_width`` fields from ``historicalwikipedialink`` and from
    ``wikipedialink``. No operation in this migration copies the values held
    in the removed fields anywhere else.
    """

    # Must be applied after this earlier migration of the same app.
    dependencies = [
        ('acros', '0038_auto_20200718_1727'),
    ]

    operations = [
        # New table holding one row per image together with its licensing
        # metadata.
        migrations.CreateModel(
            name='WikipediaImage',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('filename', models.CharField(max_length=200)),
                ('pageid', models.IntegerField()),
                # Uploaded below 'wikipedia_thumbnails/' as far as this
                # migration's state is concerned.
                ('thumbnail', models.ImageField(blank=True, null=True, upload_to='wikipedia_thumbnails/')),
                ('thumb_width', models.IntegerField(blank=True, editable=False, null=True)),
                ('thumb_height', models.IntegerField(blank=True, editable=False, null=True)),
                ('imageurl', models.URLField()),
                ('caption', models.CharField(blank=True, max_length=1000, null=True)),
                ('credit', models.TextField()),
                ('artist', models.TextField()),
                ('license_short_name', models.TextField()),
                ('attribution', models.TextField()),
                ('license_url', models.URLField()),
                ('attribution_required', models.BooleanField()),
                ('copyrighted', models.BooleanField()),
                ('timestamp', models.DateTimeField(blank=True)),
            ],
        ),
        # Drop the old per-link thumbnail fields from the history table ...
        migrations.RemoveField(
            model_name='historicalwikipedialink',
            name='thumbnail',
        ),
        migrations.RemoveField(
            model_name='historicalwikipedialink',
            name='thumbnail_caption',
        ),
        migrations.RemoveField(
            model_name='historicalwikipedialink',
            name='thumbnail_height',
        ),
        migrations.RemoveField(
            model_name='historicalwikipedialink',
            name='thumbnail_title',
        ),
        migrations.RemoveField(
            model_name='historicalwikipedialink',
            name='thumbnail_width',
        ),
        # ... and the same five fields from the live table.
        migrations.RemoveField(
            model_name='wikipedialink',
            name='thumbnail',
        ),
        migrations.RemoveField(
            model_name='wikipedialink',
            name='thumbnail_caption',
        ),
        migrations.RemoveField(
            model_name='wikipedialink',
            name='thumbnail_height',
        ),
        migrations.RemoveField(
            model_name='wikipedialink',
            name='thumbnail_title',
        ),
        migrations.RemoveField(
            model_name='wikipedialink',
            name='thumbnail_width',
        ),
    ]

View file

@ -0,0 +1,24 @@
# Generated by Django 3.0.8 on 2020-07-18 19:20
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Re-add ``thumbnail`` as a foreign key to ``WikipediaImage``.

    The field is added to both ``historicalwikipedialink`` and
    ``wikipedialink``; both columns are nullable.
    """

    # Must be applied after this earlier migration of the same app.
    dependencies = [
        ('acros', '0039_auto_20200718_1919'),
    ]

    operations = [
        # History table: no database-level constraint, nothing happens on
        # delete, and no reverse accessor is created (related_name='+').
        migrations.AddField(
            model_name='historicalwikipedialink',
            name='thumbnail',
            field=models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='acros.WikipediaImage'),
        ),
        # Live table: on_delete=CASCADE, so deleting a WikipediaImage also
        # deletes the links referencing it. Reverse accessor: wiki_articles.
        migrations.AddField(
            model_name='wikipedialink',
            name='thumbnail',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='wiki_articles', to='acros.WikipediaImage'),
        ),
    ]

View file

@ -0,0 +1,23 @@
# Generated by Django 3.0.8 on 2020-07-18 19:37
from django.db import migrations, models
class Migration(migrations.Migration):
    """Make ``attribution`` and ``license_url`` of ``wikipediaimage`` optional."""

    # Must be applied after this earlier migration of the same app.
    dependencies = [
        ('acros', '0040_auto_20200718_1920'),
    ]

    operations = [
        # Both fields become blank=True / null=True.
        migrations.AlterField(
            model_name='wikipediaimage',
            name='attribution',
            field=models.TextField(blank=True, null=True),
        ),
        migrations.AlterField(
            model_name='wikipediaimage',
            name='license_url',
            field=models.URLField(blank=True, null=True),
        ),
    ]

View file

@ -0,0 +1,54 @@
from tempfile import TemporaryFile
import requests
from django.core.files import File
from django.db import models
from acros.utils.apis import WikipediaImageAPIObject
class WikipediaImage(models.Model):
    """An image identified by its file name, with a locally stored thumbnail.

    Apart from ``filename`` and ``caption``, every field is overwritten by
    :meth:`save` with values read from ``WikipediaImageAPIObject``.
    """

    filename = models.CharField(max_length=200)
    pageid = models.IntegerField()
    # NOTE(review): the migration that creates this model declares
    # upload_to='wikipedia_thumbnails/' -- confirm which path is intended.
    thumbnail = models.ImageField(upload_to="wikipedia_images/", blank=True, null=True)
    thumb_width = models.IntegerField(blank=True, editable=False, null=True)
    thumb_height = models.IntegerField(blank=True, editable=False, null=True)
    imageurl = models.URLField()
    # Not populated by save(); stays whatever the caller assigned.
    caption = models.CharField(max_length=1000, null=True, blank=True)
    credit = models.TextField()
    artist = models.TextField()
    license_short_name = models.TextField()
    attribution = models.TextField(null=True, blank=True)
    license_url = models.URLField(null=True, blank=True)
    attribution_required = models.BooleanField()
    copyrighted = models.BooleanField()
    timestamp = models.DateTimeField(blank=True)

    def save(self, *args, **kwargs):
        """Fetch metadata and thumbnail for ``self.filename``, then persist.

        Every call performs one API request (inside
        ``WikipediaImageAPIObject``) and one thumbnail download.

        Raises:
            requests.RequestException: if the thumbnail download fails,
                times out, or answers with an HTTP error status.
        """
        img = WikipediaImageAPIObject(self.filename)
        with TemporaryFile("rb+") as fd:
            r = requests.get(img.thumburl, timeout=10)
            # Without this check the body of an error response (e.g. a 404
            # or 429 page) would be stored as if it were the thumbnail.
            r.raise_for_status()
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)
            image_file = File(fd)
            # save=False: the model row is written once, by super().save()
            # below, after all other fields have been filled in.
            self.thumbnail.save(self.filename, image_file, save=False)
        self.thumb_width, self.thumb_height = img.thumb_size
        self.pageid = img.pageid
        self.imageurl = img.url
        self.credit = img.credit
        self.artist = img.artist
        self.license_short_name = img.license_short_name
        self.attribution = img.attribution
        self.license_url = img.license_url
        self.attribution_required = img.attribution_required
        self.copyrighted = img.copyrighted
        self.timestamp = img.timestamp
        super().save(*args, **kwargs)

    @property
    def commons_url(self):
        """URL of the file's page on Wikimedia Commons."""
        return f"https://commons.wikimedia.org/wiki/File:{self.filename}"

    def __str__(self):
        return self.filename

View file

@ -1,12 +1,8 @@
from tempfile import TemporaryFile
import requests
from django.core.files import File
from django.db import models
from simple_history.models import HistoricalRecords
from acros.models import Acronym
from acros.utils.apis import fetch_wikipedia_summary
from acros.models import Acronym, WikipediaImage
from acros.utils.apis import WikipediaAPISummary
class WikipediaLink(models.Model):
@ -14,28 +10,27 @@ class WikipediaLink(models.Model):
title = models.CharField(max_length=200)
extract = models.TextField(blank=True)
extract_html = models.TextField(blank=True)
thumbnail = models.ImageField(upload_to="wikipedia_thumbnails/", blank=True, null=True,
height_field="thumbnail_height", width_field="thumbnail_width")
thumbnail_width = models.IntegerField(blank=True, editable=False, null=True)
thumbnail_height = models.IntegerField(blank=True, editable=False, null=True)
thumbnail_title = models.CharField(max_length=100, null=True, blank=True)
thumbnail_caption = models.CharField(max_length=1000, null=True, blank=True)
thumbnail = models.ForeignKey(WikipediaImage, on_delete=models.CASCADE, related_name="wiki_articles",
blank=True, null=True)
timestamp = models.DateTimeField(blank=True)
fetched = models.BooleanField(default=False)
history = HistoricalRecords()
def save(self, *args, **kwargs):
if not self.fetched:
self.extract, self.extract_html, self.timestamp, thumbnail_url, \
self.thumbnail_title, self.thumbnail_caption = fetch_wikipedia_summary(self.title)
if thumbnail_url:
with TemporaryFile("rb+") as fd:
r = requests.get(thumbnail_url)
filename = thumbnail_url.split("/")[-1]
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
image_file = File(fd)
self.thumbnail.save(filename, image_file, save=False)
summary = WikipediaAPISummary(self.title)
self.extract = summary.extract
self.extract_html = summary.extract_html
self.timestamp = summary.timestamp
self.title = summary.title
# Reuse the stored image with this file name, or create it on first use.
# Creating the row already runs WikipediaImage.save() (which fetches the
# metadata and downloads the thumbnail), so no further save() call is made
# here -- a second one would repeat the API request and the download.
if summary.image:
    filename = summary.image.split("/")[-1]
    thumbnail, _ = WikipediaImage.objects.get_or_create(filename=filename)
    self.thumbnail = thumbnail
self.fetched = True
super(WikipediaLink, self).save(*args, **kwargs)
@ -44,9 +39,5 @@ class WikipediaLink(models.Model):
def url(self):
return f"https://en.wikipedia.org/wiki/{self.title}"
@property
def thumbnail_wiki_url(self):
return f"https://en.wikipedia.org/wiki/{self.thumbnail_title}"
def __str__(self):
return self.title

View file

@ -4,4 +4,5 @@ from .AcroOfTheDay import AcroOfTheDay
from .Host import Host
from .PaperReference import PaperReference
from .Weblink import Weblink
from .WikipediaImage import WikipediaImage
from .WikipediaLink import WikipediaLink

View file

@ -62,9 +62,8 @@
<div class="card">
{% if link.thumbnail %}
<a href="{{ link.url }}">
<img src="{{ link.thumbnail.url }}" class="card-img-top"
width="{{ link.thumbnail_width }}" height="{{ link.thumbnail_height }}"
{% if link.thumbnail_caption %}title="{{ link.thumbnail_caption }}"{% endif %}>
{# The size lives on WikipediaImage as thumb_width / thumb_height. #}
<img src="{{ link.thumbnail.thumbnail.url }}" class="card-img-top"
     width="{{ link.thumbnail.thumb_width }}" height="{{ link.thumbnail.thumb_height }}">
</a>
{% endif %}
<div class="card-body">
@ -81,7 +80,10 @@
title="This snippet and thumbnail is from the English Wikipedia and licensed under the CC-BY-SA 3.0 license">
License
</a>
<a href="{{ link.thumbnail_wiki_url }}" target="_blank" rel="noopener" class="text-muted">
<a href="{{ link.thumbnail.commons_url }}" target="_blank" rel="noopener"
class="text-muted"
data-toggle="tooltip"
title="by {{ link.thumbnail.artist }} under {{ link.thumbnail.license_short_name }}">
Image-Source
</a>
</div>

View file

@ -1,26 +1,40 @@
from typing import Tuple, Optional
import requests
from bs4 import BeautifulSoup
from acros.utils.html import clean_html, string_to_bool
def fetch_wikipedia_summary(title: str):
    """Fetch summary and first media item of an English Wikipedia page.

    Args:
        title: Page title, appended as-is to the REST endpoint URLs.

    Returns:
        A 6-tuple ``(extract, extract_html, timestamp, thumbnail_url,
        image_title, image_caption)``. ``thumbnail_url`` is ``None`` when the
        summary has no ``thumbnail`` entry; ``image_title`` and
        ``image_caption`` are ``None`` when the media list is empty, and
        ``image_caption`` is also ``None`` when the first item has no caption.

    Raises:
        requests.HTTPError: if either request returns an error status.
    """
    r = requests.get("https://en.wikipedia.org/api/rest_v1/page/summary/" + title)
    r.raise_for_status()
    data = r.json()
    r2 = requests.get("https://en.wikipedia.org/api/rest_v1/page/media-list/" + title)
    r2.raise_for_status()
    image_data = r2.json()["items"]
    if image_data:
        first_item = image_data[0]
        image_title = first_item["title"]
        # The caption is a key of the item itself; testing the enclosing
        # list for "caption" never matches and always yielded None.
        image_caption = first_item["caption"]["text"] if "caption" in first_item else None
    else:
        image_title = image_caption = None
    return (
        data["extract"], data["extract_html"], data["timestamp"],
        data["thumbnail"]["source"] if "thumbnail" in data else None,
        image_title, image_caption
    )
class WikipediaAPISummary:
    """Page summary fetched from the English Wikipedia REST API.

    The decoded JSON response is kept in ``self.data``; the properties are
    thin accessors for individual keys of that response.
    """

    urlbase = "https://en.wikipedia.org/api/rest_v1/page/summary/"
    # Seconds to wait for the API before giving up instead of hanging.
    request_timeout = 10

    def __init__(self, title: str):
        """Request the summary for *title*.

        Args:
            title: Page title in plain (not percent-encoded) form.

        Raises:
            requests.RequestException: on connection errors, timeouts or an
                HTTP error status.
        """
        r = requests.get(self._summary_url(title), timeout=self.request_timeout)
        r.raise_for_status()
        self.data = r.json()

    @classmethod
    def _summary_url(cls, title: str) -> str:
        """Build the endpoint URL with *title* as a single path segment."""
        from urllib.parse import quote

        # safe="" also escapes "/", so a title such as "AC/DC" stays one path
        # segment, and "?", "#" or "%" cannot be mistaken for URL syntax.
        return cls.urlbase + quote(title, safe="")

    @property
    def title(self) -> str:
        """The ``title`` value of the response."""
        return self.data["title"]

    @property
    def extract(self) -> str:
        """The ``extract`` value of the response."""
        return self.data["extract"]

    @property
    def extract_html(self) -> str:
        """The ``extract_html`` value of the response."""
        return self.data["extract_html"]

    @property
    def timestamp(self) -> str:
        """The ``timestamp`` value of the response."""
        return self.data["timestamp"]

    @property
    def image(self) -> Optional[str]:
        """Source URL of ``originalimage``, or ``None`` if the key is absent."""
        if "originalimage" in self.data:
            return self.data["originalimage"]["source"]
        return None
def get_website_title(url: str) -> str:
@ -29,3 +43,90 @@ def get_website_title(url: str) -> str:
soup = BeautifulSoup(r.text, features="html.parser")
title = soup.find("title")
return title.text
class WikipediaImageAPIObject:
    """Image information for one file, queried from the Wikimedia Commons API.

    The decoded JSON response is kept in ``self.data`` and the first entry of
    ``query.pages`` in ``self.image_obj``; the properties read from those.
    """

    # Seconds to wait for the API before giving up instead of hanging.
    request_timeout = 10

    def __init__(self, filename: str):
        """Query the API for *filename*.

        Args:
            filename: File name without the ``File:`` prefix. It is inserted
                verbatim into the query string.
                NOTE(review): presumably already URL-safe because callers take
                it from an image URL -- confirm.

        Raises:
            requests.RequestException: on connection errors, timeouts or an
                HTTP error status.
        """
        self.filename = filename
        r = requests.get(self.api_url, timeout=self.request_timeout)
        r.raise_for_status()
        self.data = r.json()
        # "pages" is keyed by page id; only a single title is queried, so the
        # first value is the page of interest.
        self.image_obj = list(self.data["query"]["pages"].values())[0]

    @classmethod
    def from_url(cls, url: str):
        """Alternate constructor using the last path segment of *url*."""
        return cls(url.split("/")[-1])

    @property
    def api_url(self):
        """Full query URL requesting metadata, size, URL and timestamp."""
        return "https://commons.wikimedia.org/w/api.php" \
               "?action=query" \
               "&format=json" \
               f"&titles=File:{self.filename}" \
               "&prop=imageinfo" \
               "&iiprop=extmetadata|size|url|timestamp" \
               "&iiurlwidth=500"

    @property
    def pageid(self) -> int:
        """The ``pageid`` of the returned page."""
        return self.image_obj["pageid"]

    @property
    def imageinfo(self):
        """First ``imageinfo`` entry of the returned page."""
        return self.image_obj["imageinfo"][0]

    @property
    def timestamp(self) -> str:
        """The ``timestamp`` value of the image info."""
        return self.imageinfo["timestamp"]

    @property
    def thumb_size(self) -> Tuple[int, int]:
        """Thumbnail size as ``(width, height)``."""
        return self.imageinfo["thumbwidth"], self.imageinfo["thumbheight"]

    @property
    def url(self) -> str:
        """The ``url`` value of the image info."""
        return self.imageinfo["url"]

    @property
    def thumburl(self) -> str:
        """The ``thumburl`` value of the image info."""
        return self.imageinfo["thumburl"]

    @property
    def extmetadata(self):
        """The ``extmetadata`` mapping; each entry carries a ``value`` key."""
        return self.imageinfo["extmetadata"]

    @property
    def image_description(self) -> str:
        """``ImageDescription`` value passed through ``clean_html``."""
        return clean_html(self.extmetadata["ImageDescription"]["value"])

    @property
    def credit(self) -> str:
        """``Credit`` value passed through ``clean_html``."""
        return clean_html(self.extmetadata["Credit"]["value"])

    @property
    def artist(self) -> str:
        """``Artist`` value passed through ``clean_html``."""
        return clean_html(self.extmetadata["Artist"]["value"])

    @property
    def license_short_name(self) -> str:
        """The ``LicenseShortName`` value."""
        return self.extmetadata["LicenseShortName"]["value"]

    @property
    def license_url(self) -> Optional[str]:
        """The ``LicenseUrl`` value, or ``None`` if the entry is absent."""
        if "LicenseUrl" in self.extmetadata:
            return self.extmetadata["LicenseUrl"]["value"]
        return None

    @property
    def attribution_required(self) -> bool:
        """``AttributionRequired`` converted with ``string_to_bool``."""
        return string_to_bool(self.extmetadata["AttributionRequired"]["value"])

    @property
    def copyrighted(self) -> bool:
        """``Copyrighted`` converted with ``string_to_bool``."""
        return string_to_bool(self.extmetadata["Copyrighted"]["value"])

    @property
    def attribution(self) -> Optional[str]:
        """The ``Attribution`` value, or ``None`` if the entry is absent."""
        if "Attribution" in self.extmetadata:
            return self.extmetadata["Attribution"]["value"]
        return None

9
acros/utils/html.py Normal file
View file

@ -0,0 +1,9 @@
from bs4 import BeautifulSoup
def clean_html(html: str) -> str:
    """Parse *html* with the ``html.parser`` backend and return its text."""
    parsed = BeautifulSoup(html, features="html.parser")
    return parsed.text
def string_to_bool(string: str) -> bool:
    """Return ``True`` only if *string* equals ``"true"`` ignoring case."""
    lowered = string.lower()
    return lowered == "true"