Analyze.java

/*******************************************************************************
 * Copyright (C) 2020 Ram Sadasiv
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package io.outofprintmagazine.corpus.batch.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import edu.stanford.nlp.ling.CoreAnnotations;
import io.outofprintmagazine.corpus.batch.CorpusBatchStep;
import io.outofprintmagazine.corpus.batch.ICorpusBatchStep;
import io.outofprintmagazine.nlp.Analyzer;
import io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPThumbnailAnnotation;

public class Analyze extends CorpusBatchStep implements ICorpusBatchStep {
	
	private static final Logger logger = LogManager.getLogger(Analyze.class);

	@SuppressWarnings("unused")
	private Logger getLogger() {
		return logger;
	}
	
	@Override
	public ObjectNode getDefaultProperties() {
		ObjectNode properties = getMapper().createObjectNode();
		ArrayNode customAnnotators = getMapper().createArrayNode();
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.BiberAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpParagraphAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpGenderAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.GenderAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PronounAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.CharCountAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.ParagraphCountAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.SentenceCountAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.SyllableCountAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.TokenCountAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.count.WordCountAnnotator");	    		
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpSentimentAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VaderSentimentAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerbTenseAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PunctuationMarkAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AdjectivesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PointlessAdjectivesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AdjectiveCategoriesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AdverbsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PointlessAdverbsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AdverbCategoriesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PossessivesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PrepositionCategoriesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PrepositionsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerbsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.ActionlessVerbsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.NounsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.TopicsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.SVOAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.NonAffirmativeAnnotator");	    		
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.simile.LikeAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.simile.AsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.ColorsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.FlavorsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerblessSentencesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WordlessWordsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WordnetGlossAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PerfecttenseAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.UncommonWordsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.CommonWordsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.FunctionWordsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AngliciseAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.AmericanizeAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerbGroupsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerbnetGroupsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.NounGroupsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.TemporalNGramsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhoAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhatAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhenAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhereAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhyAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.interrogative.HowAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.LocationsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.PeopleAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.MyersBriggsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.BiberDimensionsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.DatesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.conditional.IfAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.conditional.BecauseAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.QuotesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WordsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.FleschKincaidAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.VerbHypernymsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.NounHypernymsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WikipediaGlossAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WikipediaPageviewTopicsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.WikipediaCategoriesAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.ActorsAnnotator");
		customAnnotators.add("io.outofprintmagazine.nlp.pipeline.annotators.SettingsAnnotator");
		properties.set("customAnnotators", customAnnotators);
		return properties;
	}
	
	private Analyzer ta = null;
	
	public Analyze() {
		super();
	}
	
	@Override
	public ArrayNode runOne(ObjectNode inputStepItem) throws Exception {
		if (ta == null) {
			List<String> customAnnotators = new ArrayList<>();
			Iterator<JsonNode> customAnnotatorsIter = ((ArrayNode) getData().getProperties().get("customAnnotators")).elements();
			while (customAnnotatorsIter.hasNext()){
			    customAnnotators.add(customAnnotatorsIter.next().asText());
			}
			ta = new Analyzer(getParameterStore(), customAnnotators);
		}
		ArrayNode retval = getMapper().createArrayNode();
		try {
			String doc = getTextDocumentFromStorage(inputStepItem);
			
			ObjectNode outputStepItem = copyInputToOutput(inputStepItem);
//			ObjectNode storageProperties = getMapper().createObjectNode();
//			//"Sun, 16 Feb 2020 23:17:38 GMT"
//			storageProperties.put("Content-Type", "text/plain");
//			storageProperties.put("mimeType", "text/plain");
//			storageProperties.put("charset",  "us-ascii");
//			storageProperties.put("Date", getDateFormat().format(new Date(System.currentTimeMillis())));
	
			Properties metadata = new Properties();
			metadata.put(CoreAnnotations.DocIDAnnotation.class.getSimpleName(), getDocID(inputStepItem));
			metadata.put(CoreAnnotations.DocTypeAnnotation.class.getSimpleName(), getData().getCorpusId());
			metadata.put(CoreAnnotations.AuthorAnnotation.class.getSimpleName(), getAuthor(inputStepItem));
			metadata.put(CoreAnnotations.DocDateAnnotation.class.getSimpleName(), getDate(inputStepItem));
			metadata.put(CoreAnnotations.DocTitleAnnotation.class.getSimpleName(), getTitle(inputStepItem));
			metadata.put(CoreAnnotations.DocSourceTypeAnnotation.class.getSimpleName(), getLink(inputStepItem));
			if (inputStepItem.has("oop_DocThumbnail")) {
				metadata.put(OOPThumbnailAnnotation.class.getSimpleName(), inputStepItem.get("oop_DocThumbnail").asText("blank.png"));
			}
			Iterator<String> fieldNamesIter = outputStepItem.fieldNames();
			while (fieldNamesIter.hasNext()) {
				String fieldName = fieldNamesIter.next();
				metadata.put(fieldName, outputStepItem.get(fieldName).asText());
			}
			//this should read the list of custom annotators from the properties
			Map<String,ObjectNode> json = ta.analyze(metadata, doc);

			outputStepItem.put(
					"oopNLPTextStorage",
					getStorage().storeScratchFileString(
						getData().getCorpusId(),
						getOutputScratchFilePath("TXT_" + getDocID(inputStepItem), "txt"),
						doc
					)
				);
			
			outputStepItem.put(
				"coreNLPStorage",
				getStorage().storeScratchFileObject(
					getData().getCorpusId(),
					getOutputScratchFilePath("STANFORD_" + getDocID(inputStepItem), "json"),
					json.get("STANFORD")
				)
			);
			
			outputStepItem.put(
				"oopNLPStorage",
				getStorage().storeScratchFileObject(
					getData().getCorpusId(),
					getOutputScratchFilePath("OOP_" + getDocID(inputStepItem), "json"),
					json.get("OOP")
				)
			);
			
			outputStepItem.put(
					"oopNLPAggregatesStorage",
					getStorage().storeScratchFileObject(
						getData().getCorpusId(),
						getOutputScratchFilePath("AGGREGATES_" + getDocID(inputStepItem), "json"),
						json.get("AGGREGATES")
				)
			);
			
			outputStepItem.put(
				"pipelineStorage",
				getStorage().storeScratchFileObject(
					getData().getCorpusId(),
					getOutputScratchFilePath("PIPELINE_" + getDocID(inputStepItem), "json"),
					json.get("PIPELINE")
				)
			);
	
			retval.add(outputStepItem);
		}
		catch (ClassNotFoundException e) {
			getLogger().error(e);
			throw new IOException(e);
		} 
		catch (InstantiationException e) {
			getLogger().error(e);
			throw new IOException(e);
		} 
		catch (IllegalAccessException e) {
			getLogger().error(e);
			throw new IOException(e);
		}
		return retval;
	}

}