ParseStory.java
package io.outofprintmagazine.corpus.batch.impl.wikisource;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.nodes.Document;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.outofprintmagazine.corpus.batch.CorpusBatchStep;
import io.outofprintmagazine.corpus.batch.ICorpusBatchStep;
public class ParseStory extends CorpusBatchStep implements ICorpusBatchStep {
private static final Logger logger = LogManager.getLogger(ParseStory.class);
@SuppressWarnings("unused")
private Logger getLogger() {
return logger;
}
@Override
public ObjectNode getDefaultProperties() {
ObjectNode properties = getMapper().createObjectNode();
properties.put("oop_Text", "div.mw-parser-output p");
properties.put("esnlc_AuthorAnnotation", "div.gen_header_title #header_author_text");
properties.put("esnlc_DocTitleAnnotation", "div.gen_header_title #header_title_text");
return properties;
}
public ParseStory() {
super();
}
@Override
public ArrayNode runOne(ObjectNode inputStepItem) throws Exception {
ArrayNode retval = getMapper().createArrayNode();
Document doc = getJsoupDocumentFromStorage(inputStepItem);
ObjectNode outputStepItem = copyInputToOutput(inputStepItem);
setTitle(doc, outputStepItem);
setAuthor(doc, outputStepItem);
setStorageLink(
getStorage().storeScratchFileString(
getData().getCorpusId(),
getOutputScratchFilePathFromInput(inputStepItem, "txt"),
getText(doc).toString().trim()
),
outputStepItem
);
retval.add(outputStepItem);
return retval;
}
}