// Analyzer.java

/*******************************************************************************
 * Copyright (C) 2020 Ram Sadasiv
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package io.outofprintmagazine.nlp;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.JSONOutputter;
import io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPThumbnailAnnotation;
import io.outofprintmagazine.nlp.pipeline.annotators.IOOPAnnotator;
import io.outofprintmagazine.nlp.pipeline.annotators.RunnableOOPAnnotator;
import io.outofprintmagazine.nlp.pipeline.serializers.CoreNlpSerializer;
import io.outofprintmagazine.nlp.utils.CoreNlpUtils;
import io.outofprintmagazine.util.IParameterStore;


/**
 * <p>The main library execution entry point for oopcorenlp.</p>
 * <p>{@link #analyze(Properties, String)} runs the CoreNLP pipeline, runs the custom annotators,
 * serializes the annotation tree, and returns a map holding up to four JSON documents keyed by:</p>
 * <ul>
 * 	<li>STANFORD</li>
 * 	<li>OOP</li>
 * 	<li>AGGREGATES</li>
 * 	<li>PIPELINE</li>
 * </ul>
 * @see CoreNlpUtils
 * @see IOOPAnnotator
 * @see CoreNlpSerializer
 * @see io.outofprintmagazine.nlp.pipeline.serializers.ISerializer
 * @author Ram Sadasiv
 */
public class Analyzer {

	private static final Logger logger = LogManager.getLogger(Analyzer.class);

	/** Shared JSON mapper; used only to parse the CoreNLP JSON output and create empty object nodes. */
	private static final ObjectMapper MAPPER = new ObjectMapper();

	/**
	 * Simple class names of the annotators that {@link #annotate(CoreDocument)} wraps in a
	 * {@link RunnableOOPAnnotator} and executes after all other annotators have finished.
	 */
	private static final List<String> THREADABLE_ANNOTATORS = Arrays.asList(
			"WordlessWordsAnnotator"
			, "WikipediaGlossAnnotator"
			, "WikipediaPageviewTopicsAnnotator"
			, "WikipediaCategoriesAnnotator"
			, "ActorsAnnotator"
			, "SettingsAnnotator"
	);

	/** Annotators selected at construction time, in {@link #customAnnotatorClassNames} order. */
	private final ArrayList<IOOPAnnotator> customAnnotators = new ArrayList<>();
	private final IParameterStore parameterStore;

	/**
	 * Instantiates and initializes the requested custom annotators.
	 * <p>Only class names that also appear in {@link #customAnnotatorClassNames} are loaded, and they
	 * are loaded in the order of that list (not the order of the argument). Each selected class is
	 * created through its public no-arg constructor and initialized with the parameter store. A name
	 * that is requested more than once is loaded only once.</p>
	 *
	 * @param parameterStore passed to every annotator's {@code init} and later to {@link CoreNlpUtils}
	 * @param annotatorClassNames fully qualified class names of the annotators to enable
	 * @throws ClassNotFoundException if a selected class cannot be loaded
	 * @throws NoSuchMethodException if a selected class has no public no-arg constructor
	 * @throws InstantiationException if a selected class cannot be instantiated
	 * @throws IllegalAccessException if the constructor is not accessible
	 * @throws InvocationTargetException if the constructor itself throws
	 */
	public Analyzer(IParameterStore parameterStore, List<String> annotatorClassNames) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException {
		this.parameterStore = parameterStore;
		// Requested names outside the known list have always been ignored; make that visible.
		for (String requestedClassName : annotatorClassNames) {
			if (!customAnnotatorClassNames.contains(requestedClassName)) {
				logger.warn("Ignoring unknown CustomAnnotator: {}", requestedClassName);
			}
		}
		//order is important: iterate the known list so annotators run in its declared order
		for (String customAnnotatorClassName : customAnnotatorClassNames) {
			if (!annotatorClassNames.contains(customAnnotatorClassName)) {
				continue;
			}
			logger.debug("Adding CustomAnnotator: {}", customAnnotatorClassName);
			Object annotator = Class.forName(customAnnotatorClassName).getConstructor().newInstance();
			if (annotator instanceof IOOPAnnotator) {
				IOOPAnnotator oopAnnotator = (IOOPAnnotator) annotator;
				oopAnnotator.init(parameterStore);
				customAnnotators.add(oopAnnotator);
			}
			else {
				logger.warn("Skipping {}: not an IOOPAnnotator", customAnnotatorClassName);
			}
		}
	}

	/**
	 * @return the live list of annotators selected by the constructor (not a copy)
	 */
	public ArrayList<IOOPAnnotator> getCustomAnnotators() {
		return customAnnotators;
	}

	/**
	 * Fully qualified class names of every custom annotator this class knows how to load.
	 * The list order is the order in which selected annotators are instantiated, run and serialized.
	 */
	public static List<String> customAnnotatorClassNames = Arrays.asList(
			"io.outofprintmagazine.nlp.pipeline.annotators.BiberAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpParagraphAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpGenderAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.GenderAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PronounAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.CharCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.ParagraphCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.SentenceCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.SyllableCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.TokenCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.count.WordCountAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.CoreNlpSentimentAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VaderSentimentAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerbTenseAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PunctuationMarkAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AdjectivesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PointlessAdjectivesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AdjectiveCategoriesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AdverbsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PointlessAdverbsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AdverbCategoriesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PossessivesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PrepositionCategoriesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PrepositionsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerbsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.ActionlessVerbsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.NounsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.TopicsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.SVOAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.NonAffirmativeAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.simile.LikeAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.simile.AsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.ColorsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.FlavorsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerblessSentencesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WordlessWordsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WordnetGlossAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PerfecttenseAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.UncommonWordsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.CommonWordsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.FunctionWordsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AngliciseAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.AmericanizeAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerbGroupsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerbnetGroupsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.NounGroupsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.TemporalNGramsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhoAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhatAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhenAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhereAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.WhyAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.interrogative.HowAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.LocationsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.PeopleAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.MyersBriggsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.DatesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.conditional.IfAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.conditional.BecauseAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.QuotesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WordsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.FleschKincaidAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.VerbHypernymsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.NounHypernymsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WikipediaGlossAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WikipediaPageviewTopicsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.WikipediaCategoriesAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.BiberDimensionsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.ActorsAnnotator"
			, "io.outofprintmagazine.nlp.pipeline.annotators.SettingsAnnotator"
	);

	/**
	 * Wraps the text in a {@link CoreDocument} and copies document-level metadata onto it.
	 * <p>Each value is looked up in {@code metadata} under the simple class name of the annotation
	 * it is stored as (e.g. {@code DocIDAnnotation}); a hard-coded placeholder is used when the
	 * property is absent.</p>
	 *
	 * @param metadata document properties; must not be null
	 * @param text the raw text to analyze
	 * @return a new, not yet annotated document
	 */
	private CoreDocument prepareDocument(Properties metadata, String text) {
		CoreDocument document = new CoreDocument(text);
		document.annotation().set(
				CoreAnnotations.DocIDAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.DocIDAnnotation.class.getSimpleName(),
						"LittleNaomi"
						)
				);
		document.annotation().set(
				CoreAnnotations.DocTitleAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.DocTitleAnnotation.class.getSimpleName(),
						"story title"
						)
				);
		document.annotation().set(
				CoreAnnotations.DocTypeAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.DocTypeAnnotation.class.getSimpleName(),
						"Submissions"
						)
				);
		document.annotation().set(
				CoreAnnotations.DocSourceTypeAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.DocSourceTypeAnnotation.class.getSimpleName(),
						"outofprintmagazine@gmail.com"
						)
				);
		document.annotation().set(
				CoreAnnotations.AuthorAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.AuthorAnnotation.class.getSimpleName(),
						"Author"
						)
				);
		document.annotation().set(
				CoreAnnotations.DocDateAnnotation.class,
				metadata.getProperty(
						CoreAnnotations.DocDateAnnotation.class.getSimpleName(),
						"9 January 2014 21:05"
						)
				);
		document.annotation().set(
				OOPThumbnailAnnotation.class,
				metadata.getProperty(
						OOPThumbnailAnnotation.class.getSimpleName(),
						"blank.png"
						)
				);
		return document;
	}

	/**
	 * Runs the pipeline obtained from {@link CoreNlpUtils} over the document.
	 *
	 * @param document the document to annotate in place
	 * @throws IOException propagated from {@link CoreNlpUtils}
	 */
	private void runCoreNLP(CoreDocument document) throws IOException  {
		CoreNlpUtils.getInstance(parameterStore).getPipeline().annotate(document);
	}

	/**
	 * Runs every selected custom annotator over the document.
	 * <p>Annotators whose simple class name is in {@link #THREADABLE_ANNOTATORS} are wrapped in a
	 * {@link RunnableOOPAnnotator} and deferred until all other annotators have been annotated and
	 * scored; the rest run immediately, in list order. Elapsed time is logged at info level.
	 * Failures while waiting on the deferred annotators are logged and not rethrown.</p>
	 *
	 * @param document the CoreNLP-annotated document to enrich in place
	 */
	private void annotate(CoreDocument document) {
		List<RunnableOOPAnnotator> threadedAnnotators = new ArrayList<>();
		for (IOOPAnnotator annotator : customAnnotators) {
			long startTime = System.currentTimeMillis();
			if (THREADABLE_ANNOTATORS.contains(annotator.getClass().getSimpleName())) {
				threadedAnnotators.add(new RunnableOOPAnnotator(annotator, document));
			}
			else {
				annotator.annotate(document.annotation());
				annotator.score(document);
				logger.info(String.format("%s %d ms", annotator.getClass().getSimpleName(), System.currentTimeMillis()-startTime));
			}
		}
		// NOTE(review): run() is invoked directly. If RunnableOOPAnnotator is a Thread subclass this
		// executes each annotator on the calling thread, one after another, rather than concurrently.
		// Confirm whether start() was intended before changing it.
		for (RunnableOOPAnnotator threadedAnnotator : threadedAnnotators) {
			threadedAnnotator.run();
		}
		// Timing below is cumulative from the start of the join phase, not per annotator.
		long startTime = System.currentTimeMillis();
		try {
			for (RunnableOOPAnnotator threadedAnnotator : threadedAnnotators) {
				threadedAnnotator.join();
				logger.info(String.format("%s %d ms", threadedAnnotator.getAnnotator().getClass().getSimpleName(), System.currentTimeMillis()-startTime));
			}
		}
		catch (Exception e) {
			if (e instanceof InterruptedException) {
				// Preserve the interrupt so callers further up the stack can observe it.
				Thread.currentThread().interrupt();
			}
			logger.error("Failed while waiting for threaded annotators", e);
		}
	}

	/**
	 * Lets every selected annotator write its annotations into the document JSON.
	 *
	 * @param document the fully annotated document
	 * @param json the target node, modified in place
	 */
	private void serialize(CoreDocument document, ObjectNode json)  {
		for (IOOPAnnotator annotator : customAnnotators) {
			annotator.serialize(document, json);
		}
	}

	/**
	 * Lets every selected annotator write its aggregate document into the aggregates JSON.
	 *
	 * @param document the fully annotated document
	 * @param json the target node, modified in place
	 */
	private void serializeAggregates(CoreDocument document, ObjectNode json) {
		for (IOOPAnnotator annotator : customAnnotators) {
			annotator.serializeAggregateDocument(document, json);
		}
	}

	/**
	 * Writes a description of this analysis run into {@code json}.
	 * <p>Four arrays are added: {@code annotations} (annotation class name to description for every
	 * selected annotator), {@code coreNlpProperties} (the pipeline properties from
	 * {@link CoreNlpUtils}), {@code customProperties} (the supplied properties) and {@code analysis}
	 * (start time, end time, elapsed milliseconds and local host).</p>
	 *
	 * @param json the target node, modified in place
	 * @param startTime analysis start, in epoch milliseconds
	 * @param properties the properties recorded under {@code customProperties}; must not be null
	 * @throws IOException propagated from {@link CoreNlpUtils}
	 */
	private void serializePipeline(ObjectNode json, long startTime, Properties properties) throws IOException {
		ArrayNode annotatorList = json.putArray("annotations");
		for (IOOPAnnotator annotator : customAnnotators) {
			annotatorList.addObject().put(annotator.getAnnotationClass().getSimpleName(), annotator.getDescription());
		}

		ArrayNode coreNlpProperties = json.putArray("coreNlpProperties");
		Properties defaultProps = CoreNlpUtils.getInstance(parameterStore).getPipelineProps();
		for (String propertyName : defaultProps.stringPropertyNames()) {
			ObjectNode property = coreNlpProperties.addObject();
			property.put("name", propertyName);
			property.put("value", defaultProps.getProperty(propertyName));
		}

		ArrayNode customProperties = json.putArray("customProperties");
		for (String propertyName : properties.stringPropertyNames()) {
			ObjectNode property = customProperties.addObject();
			property.put("name", propertyName);
			property.put("value", properties.getProperty(propertyName));
		}

		ArrayNode analysisList = json.putArray("analysis");
		// SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
		SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSSZ");
		analysisList.addObject().put("startTime", fmt.format(new java.util.Date(startTime)));
		long endTime = System.currentTimeMillis();
		analysisList.addObject().put("endTime", fmt.format(new java.util.Date(endTime)));
		analysisList.addObject().put("elapsedMS", Long.toString(endTime-startTime));
		String ipAddr = "localhost.localdomain/127.0.0.1";
		try {
			ipAddr = InetAddress.getLocalHost().toString();
		}
		catch (UnknownHostException ignored) {
			// Best effort: the host entry is informational, so keep the loopback placeholder.
			logger.debug("Could not resolve local host; using {}", ipAddr);
		}
		analysisList.addObject().put("host", ipAddr);
	}

	/**
	 * Analyzes a text and returns the resulting JSON documents.
	 * <p>The result map is filled in this order: {@code STANFORD} (CoreNLP JSON output),
	 * {@code OOP} (document serialization plus custom annotations), {@code AGGREGATES} and
	 * {@code PIPELINE}. If any step fails the exception is logged, not rethrown, and the map
	 * contains only the documents completed before the failure, so callers should check for
	 * missing keys.</p>
	 *
	 * @param metadata document properties, also recorded in the {@code PIPELINE} document; must not be null
	 * @param text the raw text to analyze
	 * @return a map from document name to JSON document; possibly incomplete, never null
	 */
	public Map<String, ObjectNode> analyze(Properties metadata, String text) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		long startTime = System.currentTimeMillis();
		CoreDocument document = prepareDocument(metadata, text);
		Map<String,ObjectNode> retval = new HashMap<>();
		String docId = metadata.getProperty(CoreAnnotations.DocIDAnnotation.class.getSimpleName());

		try {
			logger.debug("analyzing {}", docId);

			runCoreNLP(document);
			// Captured before the custom annotators run.
			retval.put("STANFORD", (ObjectNode) MAPPER.readTree(JSONOutputter.jsonPrint(document.annotation())));

			annotate(document);

			CoreNlpSerializer documentSerializer = new CoreNlpSerializer();

			ObjectNode json = MAPPER.createObjectNode();
			documentSerializer.serialize(document, json);
			serialize(document, json);
			retval.put("OOP", json);

			ObjectNode aggregates = MAPPER.createObjectNode();
			documentSerializer.serializeAggregate(document, aggregates);
			serializeAggregates(document, aggregates);
			retval.put("AGGREGATES", aggregates);

			ObjectNode pipeline = MAPPER.createObjectNode();
			serializePipeline(pipeline, startTime, metadata);
			retval.put("PIPELINE", pipeline);
		}
		catch (Exception e) {
			// Deliberately best effort: return whatever was produced before the failure.
			logger.error("Analysis failed for {}", docId, e);
		}
		return retval;
	}

}