// ParseTOC.java
package io.outofprintmagazine.corpus.batch.impl.wikisource;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.outofprintmagazine.corpus.batch.CorpusBatchStep;
import io.outofprintmagazine.corpus.batch.ICorpusBatchStep;
/**
 * Corpus batch step that loads a stored HTML page, selects anchors with the
 * configured {@code selector} property, and emits one output item per anchor
 * whose {@code href} passes the wiki-link filter.
 *
 * <p>Judging by the class name and the hard-coded host, the stored page is
 * expected to be a Wikisource table of contents; the code itself only relies
 * on the selector and on the {@code href} prefixes.
 */
public class ParseTOC extends CorpusBatchStep implements ICorpusBatchStep {

    private static final Logger logger = LogManager.getLogger(ParseTOC.class);

    /** Host prepended to every accepted site-relative {@code href}. */
    private static final String WIKISOURCE_HOST = "https://en.wikisource.org";

    /** Prefix an {@code href} must start with to be considered at all. */
    private static final String WIKI_PREFIX = "/wiki";

    /** Prefixes that disqualify an otherwise matching {@code href}. */
    private static final String AUTHOR_PREFIX = "/wiki/Author";
    private static final String SPECIAL_PREFIX = "/wiki/Special";

    public ParseTOC() {
        super();
    }

    @SuppressWarnings("unused")
    private Logger getLogger() {
        return logger;
    }

    /**
     * Builds the default configuration for this step.
     *
     * @return a new object node holding three {@code esnlc_*} entries and the
     *         {@code selector} entry that {@link #runOne(ObjectNode)} reads
     */
    @Override
    public ObjectNode getDefaultProperties() {
        ObjectNode defaults = getMapper().createObjectNode();
        defaults.put("esnlc_AuthorAnnotation", "div.gen_header_title #header_author_text");
        defaults.put("esnlc_DocDateAnnotation", "div.gen_header_title");
        defaults.put("esnlc_DocTitleAnnotation", "div.gen_header_title #header_title_text");
        defaults.put("selector", "div.mw-parser-output a");
        return defaults;
    }

    /**
     * Produces one output item for each accepted link on the stored page.
     *
     * <p>Each output item starts as a copy of the input item, has author and
     * date applied from the page via the inherited setters (their behavior is
     * defined outside this file), loses its {@code stagingLinkStorage} field,
     * and receives the absolute link built from the anchor's {@code href}.
     *
     * @param inputStepItem the item describing the stored page to parse
     * @return the output items, in the order the anchors were selected
     * @throws Exception if any of the inherited helpers fails
     */
    @Override
    public ArrayNode runOne(ObjectNode inputStepItem) throws Exception {
        ArrayNode outputItems = getMapper().createArrayNode();
        Document page = getJsoupDocumentFromStorage(inputStepItem);
        String anchorSelector = getData().getProperties().get("selector").asText();

        for (Element anchor : page.select(anchorSelector)) {
            if (!isFollowableWikiLink(anchor)) {
                continue;
            }
            String href = anchor.attr("href");

            ObjectNode item = copyInputToOutput(inputStepItem);
            setAuthor(page, item);
            setDate(page, item);
            // The copied staging reference belongs to the input page, not to the linked one.
            item.remove("stagingLinkStorage");
            setLink(WIKISOURCE_HOST + href, item);
            outputItems.add(item);
        }
        return outputItems;
    }

    /**
     * Tells whether an anchor carries an {@code href} that starts with
     * {@code /wiki} but not with {@code /wiki/Author} or {@code /wiki/Special}.
     */
    private static boolean isFollowableWikiLink(Element anchor) {
        if (!anchor.hasAttr("href")) {
            return false;
        }
        String href = anchor.attr("href");
        if (!href.startsWith(WIKI_PREFIX)) {
            return false;
        }
        return !(href.startsWith(AUTHOR_PREFIX) || href.startsWith(SPECIAL_PREFIX));
    }
}