WikimediaUtils.java
/*******************************************************************************
* Copyright (C) 2020 Ram Sadasiv
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package io.outofprintmagazine.nlp.utils;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.fasterxml.jackson.databind.JsonNode;
import io.outofprintmagazine.util.IParameterStore;
/**
 * Looks up the bitmap images attached to a Wikipedia page.
 *
 * <p>Instances are cached per {@link IParameterStore}; obtain one through
 * {@link #getInstance(IParameterStore)}. A lookup runs in two stages: first the
 * titles of the images used on the page are collected, then the image info for
 * those titles is requested in batches of {@code BATCH_SIZE} and the URLs of
 * the entries whose media type is {@code BITMAP} are returned.
 */
public class WikimediaUtils {

    private static final Logger logger = LogManager.getLogger(WikimediaUtils.class);

    @SuppressWarnings("unused")
    private Logger getLogger() {
        return logger;
    }

    /** Maximum number of image titles sent in a single image-info request. */
    private static final int BATCH_SIZE = 20;

    /** Instance cache, keyed by the parameter store each instance was created for. */
    private static final Map<IParameterStore, WikimediaUtils> instances = new HashMap<>();

    /** Source of the configuration properties; also used to obtain the {@code HttpUtils} instance. */
    private final IParameterStore parameterStore;

    private WikimediaUtils(IParameterStore parameterStore) throws IOException {
        this.parameterStore = parameterStore;
    }

    /**
     * Returns the cached instance for the given parameter store, creating it on first use.
     *
     * <p>This method performs no synchronization; callers that share it between
     * threads must coordinate access themselves.
     *
     * @param parameterStore the store the instance is bound to; also the cache key
     * @return the instance associated with {@code parameterStore}
     * @throws IOException declared by the constructor
     */
    public static WikimediaUtils getInstance(IParameterStore parameterStore) throws IOException {
        WikimediaUtils instance = instances.get(parameterStore);
        if (instance == null) {
            instance = new WikimediaUtils(parameterStore);
            instances.put(parameterStore, instance);
        }
        return instance;
    }

    /**
     * Returns the bitmap image URLs for the page named by {@code text}.
     * Delegates to {@link #getImages(String)}.
     *
     * @param text the page title to look up
     * @return the image URLs, possibly empty
     * @throws IOException if {@link #getImages(String)} fails
     * @throws URISyntaxException if {@link #getImages(String)} fails
     */
    public List<String> getImagesByText(String text) throws IOException, URISyntaxException {
        return getImages(text);
    }

    /**
     * Returns the bitmap image URLs for the page named by {@code text}.
     * Delegates to {@link #getImages(String)}.
     *
     * @param text the page title to look up
     * @return the image URLs, possibly empty
     * @throws IOException if {@link #getImages(String)} fails
     * @throws URISyntaxException if {@link #getImages(String)} fails
     */
    public List<String> getImagesByTag(String text) throws IOException, URISyntaxException {
        return getImages(text);
    }

    // Example requests against the Wikipedia API:
    //https://en.wikipedia.org/w/api.php?action=query&generator=images&titles=Coimbatore&prop=info
    //https://en.wikipedia.org/w/api.php?action=query&titles=File:Coimbatore-TNSTC-JnNURM-Bus.JPG&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|dimensions|sha1|mime|thumbmime|mediatype|bitdepth
    /*
    https://en.wikipedia.org/w/api.php?format=json&action=query&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|dimensions|sha1|mime|thumbmime|mediatype|bitdepth&titles=File%3A2009-3-14_ManUtd_vs_LFC_Red_Card_Vidic.JPG|File%3AAU_Fire_Danger_Indicator.jpg|File%3AAlfa_Romeo_33_SC_12_Sovralimentata_1977_red_vr_TCE.jpg|File%3AAlizarin-sample.jpg|File%3AAmsterdam_red_light_district_24-7-2003.JPG|File%3AAt_the_Devil%27s_Ball_1.jpg|File%3AAztecheaddress.jpg|File%3ABoutet_1708_color_circles.jpg|File%3AAgarplate_redbloodcells_edit.jpg|File%3ABoschTheCrucifixionOfStJulia.jpg|File%3ABritish_home_and_distant_railway_semaphore_RYG_signals.svg|File%3AByzantine_imperial_flag%2C_14th_century%2C_square.svg|File%3ACan_Setter_dog_GFDL.jpg|File%3ACardinal.jpg|File%3ACardinal_Th%C3%A9odore_Adrien_Sarr_2.JPG|File%3ACherry_blossoms_in_the_Tsutsujigaoka_Park.jpg|File%3AChinese_honor_guard_in_column_070322-F-0193C-014.JPEG|File%3ACrimson_sunset.jpg|File%3ACommons-logo.svg|File%3AElizabeth_I_Steven_Van_Der_Meulen.jpg
    https://en.wikipedia.org/w/api.php?format=json&action=query&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|dimensions|sha1|mime|thumbmime|mediatype|bitdepth&titles=File%3AA_Badge_Pinning.jpg|File%3ABadge_1012.jpg|File%3AChevalier_l%C3%A9gion_d%27honneur_2.png|File%3ADima-rs1.jpg|File%3ADispositif_d%27une_%C3%A9pingle_de_s%C3%BBret%C3%A9_sur_une_%C3%A9pinglette.JPG|File%3AGeneseeDABadge.jpg|File%3AGreenville%2C_North_Carolina_Police_Badge.jpg|File%3ACommons-logo.svg|File%3AFolder_Hexagonal_Icon.svg|File%3ANobel_Prize.png|File%3APeople_icon.svg|File%3AStar_of_the_Garter.png|File%3AQuestion_book-new.svg
    https://en.wikipedia.org/w/api.php?format=json&action=query&prop=imageinfo&iiprop=timestamp|user|userid|comment|canonicaltitle|url|size|dimensions|sha1|mime|thumbmime|mediatype|bitdepth&titles=File:Disambig_gray.svg
    */

    /**
     * Collects the {@code title} of every entry under {@code query.pages} of each
     * response page it is given, keeping only titles made of printable ASCII.
     */
    protected class ImagePageHandler implements IJsonResponseHandler {

        List<String> retval = new ArrayList<>();

        public ImagePageHandler() {
            super();
        }

        /** @return the titles collected so far, in the order they were seen */
        public List<String> getValues() {
            return retval;
        }

        /**
         * Adds the printable-ASCII titles found under {@code query.pages} to the result list.
         *
         * @param page one JSON response; {@code null} or a response without
         *             {@code query.pages} is ignored
         */
        @Override
        public void onPage(JsonNode page) {
            if (page == null) {
                return;
            }
            // path() yields a "missing" node instead of null, and iterating a
            // missing node visits nothing, so absent query/pages needs no extra check.
            for (JsonNode pageNode : page.path("query").path("pages")) {
                JsonNode titleNode = pageNode.get("title");
                if (titleNode == null) {
                    // Entry carries no title; nothing to record for it.
                    continue;
                }
                String title = titleNode.asText();
                if (StringUtils.isAsciiPrintable(title)) {
                    retval.add(title);
                }
            }
        }
    }

    /**
     * Collects the {@code url} of the first {@code imageinfo} element of every entry
     * under {@code query.pages} whose {@code mediatype} is {@code BITMAP}
     * (compared case-insensitively).
     */
    protected class ImageInfoPageHandler implements IJsonResponseHandler {

        List<String> retval = new ArrayList<>();

        public ImageInfoPageHandler() {
            super();
        }

        /** @return the image URLs collected so far, in the order they were seen */
        public List<String> getValues() {
            return retval;
        }

        /**
         * Adds the URLs of the bitmap images found under {@code query.pages} to the result list.
         * Entries without an {@code imageinfo} element, without a {@code mediatype},
         * or without a {@code url} are skipped.
         *
         * @param page one JSON response; {@code null} or a response without
         *             {@code query.pages} is ignored
         */
        @Override
        public void onPage(JsonNode page) {
            if (page == null) {
                return;
            }
            for (JsonNode pageNode : page.path("query").path("pages")) {
                // path() never returns null: a missing imageinfo array, an empty
                // array, or a missing field all resolve to a node whose text is "".
                JsonNode imageInfo = pageNode.path("imageinfo").path(0);
                String mediaType = imageInfo.path("mediatype").asText();
                String url = imageInfo.path("url").asText();
                if ("BITMAP".equalsIgnoreCase(mediaType) && !url.isEmpty()) {
                    retval.add(url);
                }
            }
        }
    }

    /**
     * Returns the URLs of the bitmap images used on the page with the given title.
     *
     * <p>Spaces in the title are replaced by underscores before it is sent. The image
     * titles of the page are fetched first (paginated, continuing on
     * {@code gimcontinue}); their image info is then requested in batches of
     * {@code BATCH_SIZE} titles joined with {@code |}.
     *
     * @param title the page title; a {@code null}, empty, or non-printable-ASCII
     *              title yields an empty list without any request being made
     * @return the URLs of the images whose media type is {@code BITMAP}, possibly empty
     * @throws IOException if one of the HTTP helpers fails
     * @throws URISyntaxException if the request URI cannot be built
     */
    public List<String> getImages(String title) throws IOException, URISyntaxException {
        // isAsciiPrintable(null) is false, so this also rejects null. An empty
        // title names no page, so skip the round-trips for it as well.
        if (StringUtils.isEmpty(title) || !StringUtils.isAsciiPrintable(title)) {
            return new ArrayList<String>();
        }

        HttpUtils httpUtils = HttpUtils.getInstance(parameterStore);

        // The value of the "wikipedia_apikey" property is sent as the User-Agent header.
        List<Header> headers = new ArrayList<>();
        headers.add(new BasicHeader("User-Agent", parameterStore.getProperty("wikipedia_apikey")));

        // Stage 1: collect the titles of the images used on the page.
        ImagePageHandler imageHandler = new ImagePageHandler();
        List<NameValuePair> imageParams = httpUtils.getWikimediaImageParameters();
        imageParams.add(new BasicNameValuePair("titles", title.replace(' ', '_')));
        httpUtils.httpGetJsonPaginated(
                httpUtils.getWikipediaApi(),
                imageParams,
                headers,
                "gimcontinue",
                imageHandler
        );

        // Stage 2: resolve the image titles to URLs, BATCH_SIZE titles per request.
        ImageInfoPageHandler imageInfoHandler = new ImageInfoPageHandler();
        List<String> imageTitles = imageHandler.getValues();
        for (int start = 0; start < imageTitles.size(); start += BATCH_SIZE) {
            int end = Math.min(start + BATCH_SIZE, imageTitles.size());
            List<String> batch = new ArrayList<>(end - start);
            for (String imageTitle : imageTitles.subList(start, end)) {
                batch.add(imageTitle.replace(' ', '_'));
            }
            // A fresh parameter list is requested per batch so "titles" is added exactly once.
            List<NameValuePair> infoParams = httpUtils.getWikimediaImageInfoParameters();
            infoParams.add(new BasicNameValuePair("titles", String.join("|", batch)));
            httpUtils.httpGetJsonCB(
                    httpUtils.buildUri(httpUtils.getWikipediaApi(), infoParams),
                    headers,
                    imageInfoHandler
            );
        }
        return imageInfoHandler.getValues();
    }
}