mirror of
https://github.com/Findus23/acronomy.git
synced 2024-09-19 15:33:45 +02:00
store wikipedia images separately
This commit is contained in:
parent
ebdf847032
commit
3c772b7805
10 changed files with 331 additions and 51 deletions
|
@ -2,7 +2,7 @@ from django.contrib import admin
|
|||
# Register your models here.
|
||||
from simple_history.admin import SimpleHistoryAdmin
|
||||
|
||||
from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host
|
||||
from acros.models import Acronym, Weblink, PaperReference, WikipediaLink, Tag, Host, WikipediaImage
|
||||
|
||||
|
||||
class OwnInline(admin.TabularInline):
|
||||
|
@ -50,8 +50,8 @@ class LinkAdmin(SimpleHistoryAdmin):
|
|||
|
||||
|
||||
class WikipediaAdmin(SimpleHistoryAdmin):
|
||||
readonly_fields = ["thumbnail_height", "thumbnail_width"]
|
||||
|
||||
# readonly_fields = ["thumbnail_height", "thumbnail_width"]
|
||||
...
|
||||
|
||||
admin.site.register(WikipediaLink, WikipediaAdmin)
|
||||
admin.site.register(Weblink, LinkAdmin)
|
||||
|
@ -59,6 +59,7 @@ admin.site.register(PaperReference, PaperAdmin)
|
|||
admin.site.register(Tag, TagAdmin)
|
||||
admin.site.register(Acronym, AcronymAdmin)
|
||||
admin.site.register(Host)
|
||||
admin.site.register(WikipediaImage)
|
||||
|
||||
admin.site.site_header="Acronomy Administration"
|
||||
admin.site.site_title="Acronomy Administration"
|
||||
|
|
74
acros/migrations/0039_auto_20200718_1919.py
Normal file
74
acros/migrations/0039_auto_20200718_1919.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
# Generated by Django 3.0.8 on 2020-07-18 19:19
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('acros', '0038_auto_20200718_1727'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='WikipediaImage',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('filename', models.CharField(max_length=200)),
|
||||
('pageid', models.IntegerField()),
|
||||
('thumbnail', models.ImageField(blank=True, null=True, upload_to='wikipedia_thumbnails/')),
|
||||
('thumb_width', models.IntegerField(blank=True, editable=False, null=True)),
|
||||
('thumb_height', models.IntegerField(blank=True, editable=False, null=True)),
|
||||
('imageurl', models.URLField()),
|
||||
('caption', models.CharField(blank=True, max_length=1000, null=True)),
|
||||
('credit', models.TextField()),
|
||||
('artist', models.TextField()),
|
||||
('license_short_name', models.TextField()),
|
||||
('attribution', models.TextField()),
|
||||
('license_url', models.URLField()),
|
||||
('attribution_required', models.BooleanField()),
|
||||
('copyrighted', models.BooleanField()),
|
||||
('timestamp', models.DateTimeField(blank=True)),
|
||||
],
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail_caption',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail_height',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail_title',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail_width',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail_caption',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail_height',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail_title',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail_width',
|
||||
),
|
||||
]
|
24
acros/migrations/0040_auto_20200718_1920.py
Normal file
24
acros/migrations/0040_auto_20200718_1920.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# Generated by Django 3.0.8 on 2020-07-18 19:20
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('acros', '0039_auto_20200718_1919'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='historicalwikipedialink',
|
||||
name='thumbnail',
|
||||
field=models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='acros.WikipediaImage'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='wikipedialink',
|
||||
name='thumbnail',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='wiki_articles', to='acros.WikipediaImage'),
|
||||
),
|
||||
]
|
23
acros/migrations/0041_auto_20200718_1937.py
Normal file
23
acros/migrations/0041_auto_20200718_1937.py
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Generated by Django 3.0.8 on 2020-07-18 19:37
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('acros', '0040_auto_20200718_1920'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='wikipediaimage',
|
||||
name='attribution',
|
||||
field=models.TextField(blank=True, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='wikipediaimage',
|
||||
name='license_url',
|
||||
field=models.URLField(blank=True, null=True),
|
||||
),
|
||||
]
|
54
acros/models/WikipediaImage.py
Normal file
54
acros/models/WikipediaImage.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
from tempfile import TemporaryFile
|
||||
|
||||
import requests
|
||||
from django.core.files import File
|
||||
from django.db import models
|
||||
|
||||
from acros.utils.apis import WikipediaImageAPIObject
|
||||
|
||||
|
||||
class WikipediaImage(models.Model):
    """A Wikimedia Commons image plus a locally stored thumbnail of it.

    Only ``filename`` has to be supplied by the caller: ``save()`` overwrites
    every other field except ``caption`` with data from the Commons API.
    """
    filename = models.CharField(max_length=200)
    pageid = models.IntegerField()
    # NOTE(review): migration 0039 declares upload_to='wikipedia_thumbnails/'
    # for this field -- confirm which directory is intended.
    thumbnail = models.ImageField(upload_to="wikipedia_images/", blank=True, null=True)
    thumb_width = models.IntegerField(blank=True, editable=False, null=True)
    thumb_height = models.IntegerField(blank=True, editable=False, null=True)
    imageurl = models.URLField()
    caption = models.CharField(max_length=1000, null=True, blank=True)
    credit = models.TextField()
    artist = models.TextField()
    license_short_name = models.TextField()
    attribution = models.TextField(null=True, blank=True)
    license_url = models.URLField(null=True, blank=True)
    attribution_required = models.BooleanField()
    copyrighted = models.BooleanField()
    timestamp = models.DateTimeField(blank=True)

    def save(self, *args, **kwargs):
        """Refresh metadata and thumbnail from Wikimedia Commons, then persist.

        The API lookup and the thumbnail download happen on every call.

        Raises:
            requests.HTTPError: if the metadata request or the thumbnail
                download answers with an HTTP error status.
        """
        img = WikipediaImageAPIObject(self.filename)
        with TemporaryFile("rb+") as fd:
            # stream=True lets iter_content() read the body piece by piece
            # instead of buffering the whole image in memory first.
            with requests.get(img.thumburl, stream=True) as r:
                # Fail loudly rather than storing an error page as the image.
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=8192):
                    fd.write(chunk)
            image_file = File(fd)
            # save=False: the model row is written once, by super().save() below.
            self.thumbnail.save(self.filename, image_file, save=False)
        self.thumb_width, self.thumb_height = img.thumb_size
        self.pageid = img.pageid
        self.imageurl = img.url
        self.credit = img.credit
        self.artist = img.artist
        self.license_short_name = img.license_short_name
        self.attribution = img.attribution
        self.license_url = img.license_url
        self.attribution_required = img.attribution_required
        self.copyrighted = img.copyrighted
        self.timestamp = img.timestamp

        super(WikipediaImage, self).save(*args, **kwargs)

    @property
    def commons_url(self):
        """URL of the file's description page on Wikimedia Commons."""
        return f"https://commons.wikimedia.org/wiki/File:{self.filename}"

    def __str__(self):
        return self.filename
|
|
@ -1,12 +1,8 @@
|
|||
from tempfile import TemporaryFile
|
||||
|
||||
import requests
|
||||
from django.core.files import File
|
||||
from django.db import models
|
||||
from simple_history.models import HistoricalRecords
|
||||
|
||||
from acros.models import Acronym
|
||||
from acros.utils.apis import fetch_wikipedia_summary
|
||||
from acros.models import Acronym, WikipediaImage
|
||||
from acros.utils.apis import WikipediaAPISummary
|
||||
|
||||
|
||||
class WikipediaLink(models.Model):
|
||||
|
@ -14,28 +10,27 @@ class WikipediaLink(models.Model):
|
|||
title = models.CharField(max_length=200)
|
||||
extract = models.TextField(blank=True)
|
||||
extract_html = models.TextField(blank=True)
|
||||
thumbnail = models.ImageField(upload_to="wikipedia_thumbnails/", blank=True, null=True,
|
||||
height_field="thumbnail_height", width_field="thumbnail_width")
|
||||
thumbnail_width = models.IntegerField(blank=True, editable=False, null=True)
|
||||
thumbnail_height = models.IntegerField(blank=True, editable=False, null=True)
|
||||
thumbnail_title = models.CharField(max_length=100, null=True, blank=True)
|
||||
thumbnail_caption = models.CharField(max_length=1000, null=True, blank=True)
|
||||
thumbnail = models.ForeignKey(WikipediaImage, on_delete=models.CASCADE, related_name="wiki_articles",
|
||||
blank=True, null=True)
|
||||
timestamp = models.DateTimeField(blank=True)
|
||||
fetched = models.BooleanField(default=False)
|
||||
history = HistoricalRecords()
|
||||
|
||||
def save(self, *args, **kwargs):
    """Populate the link from the Wikipedia summary API on first save, then persist.

    While ``fetched`` is false, the extract, timestamp and title are taken
    from ``WikipediaAPISummary``. If the summary has an image, the matching
    ``WikipediaImage`` is looked up by filename or created, and linked as
    ``thumbnail``.
    """
    if not self.fetched:
        summary = WikipediaAPISummary(self.title)
        self.extract = summary.extract
        self.extract_html = summary.extract_html
        self.timestamp = summary.timestamp
        self.title = summary.title
        if summary.image:
            # The last path segment of the image URL is used as the filename.
            filename = summary.image.split("/")[-1]
            # get_or_create() saves a new row exactly once. The previous
            # create() followed by save() ran WikipediaImage.save() twice,
            # repeating the API request and the thumbnail download.
            thumbnail, _created = WikipediaImage.objects.get_or_create(filename=filename)
            self.thumbnail = thumbnail
        self.fetched = True

    super(WikipediaLink, self).save(*args, **kwargs)
|
||||
|
@ -44,9 +39,5 @@ class WikipediaLink(models.Model):
|
|||
def url(self):
|
||||
return f"https://en.wikipedia.org/wiki/{self.title}"
|
||||
|
||||
@property
|
||||
def thumbnail_wiki_url(self):
|
||||
return f"https://en.wikipedia.org/wiki/{self.thumbnail_title}"
|
||||
|
||||
def __str__(self):
|
||||
return self.title
|
||||
|
|
|
@ -4,4 +4,5 @@ from .AcroOfTheDay import AcroOfTheDay
|
|||
from .Host import Host
|
||||
from .PaperReference import PaperReference
|
||||
from .Weblink import Weblink
|
||||
from .WikipediaImage import WikipediaImage
|
||||
from .WikipediaLink import WikipediaLink
|
||||
|
|
|
@ -62,9 +62,8 @@
|
|||
<div class="card">
|
||||
{% if link.thumbnail %}
|
||||
<a href="{{ link.url }}">
|
||||
<img src="{{ link.thumbnail.url }}" class="card-img-top"
|
||||
width="{{ link.thumbnail_width }}" height="{{ link.thumbnail_height }}"
|
||||
{% if link.thumbnail_caption %}title="{{ link.thumbnail_caption }}"{% endif %}>
|
||||
{# width/height use the WikipediaImage fields thumb_width / thumb_height #}
<img src="{{ link.thumbnail.thumbnail.url }}" class="card-img-top"
width="{{ link.thumbnail.thumb_width }}" height="{{ link.thumbnail.thumb_height }}">
|
||||
</a>
|
||||
{% endif %}
|
||||
<div class="card-body">
|
||||
|
@ -81,7 +80,10 @@
|
|||
title="This snippet and thumbnail is from the English Wikipedia and licensed under the CC-BY-SA 3.0 license">
|
||||
License
|
||||
</a>
|
||||
<a href="{{ link.thumbnail_wiki_url }}" target="_blank" rel="noopener" class="text-muted">
|
||||
<a href="{{ link.thumbnail.commons_url }}" target="_blank" rel="noopener"
|
||||
class="text-muted"
|
||||
data-toggle="tooltip"
|
||||
title="by {{ link.thumbnail.artist }} under {{ link.thumbnail.license_short_name }}">
|
||||
Image-Source
|
||||
</a>
|
||||
</div>
|
||||
|
|
|
@ -1,26 +1,40 @@
|
|||
from typing import Tuple, Optional
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from acros.utils.html import clean_html, string_to_bool
|
||||
|
||||
def fetch_wikipedia_summary(title: str):
|
||||
r = requests.get("https://en.wikipedia.org/api/rest_v1/page/summary/" + title)
|
||||
r.raise_for_status()
|
||||
data = r.json()
|
||||
# print(data)
|
||||
|
||||
r2 = requests.get("https://en.wikipedia.org/api/rest_v1/page/media-list/" + title)
|
||||
r2.raise_for_status()
|
||||
image_data = r2.json()["items"]
|
||||
if len(image_data) > 0:
|
||||
image_title = image_data[0]["title"]
|
||||
image_caption = image_data[0]["caption"]["text"] if "caption" in image_data else None
|
||||
else:
|
||||
image_title = image_caption = None
|
||||
return (
|
||||
data["extract"], data["extract_html"], data["timestamp"],
|
||||
data["thumbnail"]["source"] if "thumbnail" in data else None,
|
||||
image_title, image_caption
|
||||
)
|
||||
class WikipediaAPISummary:
    """Read-only view of the English Wikipedia REST ``page/summary`` response.

    The request is made in the constructor; each property returns one entry
    of the decoded JSON document stored in ``data``.
    """

    urlbase = "https://en.wikipedia.org/api/rest_v1/page/summary/"

    def __init__(self, title: str):
        """Fetch the summary for ``title``; raises ``requests.HTTPError`` on an error status."""
        response = requests.get(self.urlbase + title)
        response.raise_for_status()
        self.data = response.json()

    @property
    def title(self) -> str:
        """The ``title`` entry of the response."""
        return self.data["title"]

    @property
    def extract(self) -> str:
        """The ``extract`` entry of the response."""
        return self.data["extract"]

    @property
    def extract_html(self) -> str:
        """The ``extract_html`` entry of the response."""
        return self.data["extract_html"]

    @property
    def timestamp(self) -> str:
        """The ``timestamp`` entry of the response."""
        return self.data["timestamp"]

    @property
    def image(self) -> Optional[str]:
        """Source URL of ``originalimage``, or ``None`` when the response has none."""
        return self.data["originalimage"]["source"] if "originalimage" in self.data else None
|
||||
|
||||
|
||||
def get_website_title(url: str) -> str:
|
||||
|
@ -29,3 +43,90 @@ def get_website_title(url: str) -> str:
|
|||
soup = BeautifulSoup(r.text, features="html.parser")
|
||||
title = soup.find("title")
|
||||
return title.text
|
||||
|
||||
|
||||
class WikipediaImageAPIObject:
    """Metadata of one Wikimedia Commons file, read from the ``imageinfo`` API.

    The constructor performs the HTTP request. All properties read from the
    decoded response kept in ``data`` / ``image_obj``.
    """

    def __init__(self, filename: str):
        """Query Commons for ``File:<filename>``.

        Raises:
            requests.HTTPError: if the API answers with an HTTP error status.
        """
        self.filename = filename
        r = requests.get(self.api_url)
        r.raise_for_status()
        self.data = r.json()
        # "pages" is keyed by page id; only one title is requested, so the
        # first (and only) value is the record for this file.
        self.image_obj = list(self.data["query"]["pages"].values())[0]

    @classmethod
    def from_url(cls, url: str):
        """Build an instance from an image URL, using its last path segment as filename."""
        return cls(url.split("/")[-1])

    @property
    def api_url(self):
        """Full query URL for this file, including a 500px-wide thumbnail request."""
        return "https://commons.wikimedia.org/w/api.php" \
               "?action=query" \
               "&format=json" \
               f"&titles=File:{self.filename}" \
               "&prop=imageinfo" \
               "&iiprop=extmetadata|size|url|timestamp" \
               "&iiurlwidth=500"

    @property
    def pageid(self) -> int:
        return self.image_obj["pageid"]

    @property
    def imageinfo(self):
        """First ``imageinfo`` entry of the page record."""
        return self.image_obj["imageinfo"][0]

    @property
    def timestamp(self) -> str:
        return self.imageinfo["timestamp"]

    @property
    def thumb_size(self) -> Tuple[int, int]:
        """Thumbnail size as ``(width, height)``."""
        return self.imageinfo["thumbwidth"], self.imageinfo["thumbheight"]

    @property
    def url(self) -> str:
        return self.imageinfo["url"]

    @property
    def thumburl(self) -> str:
        return self.imageinfo["thumburl"]

    @property
    def extmetadata(self):
        """The ``extmetadata`` mapping; each entry wraps its content in a ``"value"`` key."""
        return self.imageinfo["extmetadata"]

    @property
    def image_description(self) -> str:
        return clean_html(self.extmetadata["ImageDescription"]["value"])

    @property
    def credit(self) -> str:
        return clean_html(self.extmetadata["Credit"]["value"])

    @property
    def artist(self) -> str:
        return clean_html(self.extmetadata["Artist"]["value"])

    @property
    def license_short_name(self) -> str:
        return self.extmetadata["LicenseShortName"]["value"]

    @property
    def license_url(self) -> Optional[str]:
        """License URL, or ``None`` when the metadata has no ``LicenseUrl`` entry."""
        if "LicenseUrl" in self.extmetadata:
            return self.extmetadata["LicenseUrl"]["value"]
        return None

    @property
    def attribution_required(self) -> bool:
        return string_to_bool(self.extmetadata["AttributionRequired"]["value"])

    @property
    def copyrighted(self) -> bool:
        return string_to_bool(self.extmetadata["Copyrighted"]["value"])

    @property
    def attribution(self) -> Optional[str]:
        """Attribution text, or ``None`` when the metadata has no ``Attribution`` entry."""
        if "Attribution" in self.extmetadata:
            return self.extmetadata["Attribution"]["value"]
        return None
|
||||
|
|
9
acros/utils/html.py
Normal file
9
acros/utils/html.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def clean_html(html: str) -> str:
    """Return the text content of ``html`` as parsed by BeautifulSoup's ``html.parser``."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.text
|
||||
|
||||
|
||||
def string_to_bool(string: str) -> bool:
    """Return ``True`` only when ``string`` equals "true", ignoring case."""
    lowered = string.lower()
    return lowered == "true"
|
Loading…
Reference in a new issue