// AbstractContextualAnnotator.java

/*******************************************************************************
 * Copyright (C) 2020 Ram Sadasiv
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package io.outofprintmagazine.nlp.pipeline.annotators;

import java.io.IOException;
import java.math.BigDecimal;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.flickr4java.flickr.FlickrException;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TypedDependency;
import io.outofprintmagazine.nlp.pipeline.ContextualAnnotation;
import io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPWikipediaGlossAnnotation;
import io.outofprintmagazine.nlp.pipeline.PhraseAnnotation;
import io.outofprintmagazine.nlp.utils.BingUtils;
import io.outofprintmagazine.nlp.utils.CoreNlpUtils;
import io.outofprintmagazine.nlp.utils.FlickrUtils;
import io.outofprintmagazine.nlp.utils.WikimediaUtils;

/**
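 * <p>Base class for custom annotators that build one {@code ContextualAnnotation} per entity phrase,
 * scoring it from the sentences and tokens that mention the entity and from the words related to
 * those tokens in the dependency tree (Stanford CoreNLP depparse).</p>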
 * @author Ram Sadasiv
 *
 */
public abstract class AbstractContextualAnnotator extends AbstractPosAnnotator implements Annotator, IOOPAnnotator {
	
	private static final Logger logger = LogManager.getLogger(AbstractContextualAnnotator.class);
	
	/**
	 * Returns the class-level logger shared by all instances of this annotator.
	 *
	 * @return the static Log4j logger created for {@code AbstractContextualAnnotator}
	 */
	@SuppressWarnings("unused")
	private Logger getLogger() {
		return logger;
	}
	
	/**
	 * Default constructor; performs no initialisation beyond the superclass constructor.
	 */
	public AbstractContextualAnnotator() {
		super();
	}
		
	/**
	 * Returns the annotation key under which the entity phrases to contextualise are stored.
	 * {@code annotate} looks this key up on the document, on each sentence and on each token,
	 * and casts every value it finds to {@code List<PhraseAnnotation>}.
	 *
	 * @return the annotation class identifying the entity phrase lists
	 */
	@SuppressWarnings("rawtypes")
	public abstract Class getEntityAnnotationClass();

	/**
	 * Supplies the {@link ContextualAnnotation} that {@code annotate} populates for one entity.
	 * It is called once per entity phrase and each result may be added to the output list, so
	 * implementations presumably return a new instance on every call (confirm against subclasses).
	 *
	 * @return the annotation object to fill in for a single entity
	 */
	protected abstract ContextualAnnotation getConcreteAnnotation();
	
	/**
	 * Declares the annotations that must already be present when this annotator runs:
	 * the CoreNLP text, token, lemma, sentence, named-entity, coreference, dependency and
	 * quotation annotations, plus the OOP location, sentiment, gender and part-of-speech
	 * score annotations.
	 *
	 * @return a new mutable set containing the required annotation classes
	 */
	@SuppressWarnings("rawtypes")
	@Override
	public Set<Class<? extends CoreAnnotation>> requires() {
		Set<Class<? extends CoreAnnotation>> required = new HashSet<Class<? extends CoreAnnotation>>();

		// Stanford CoreNLP annotations
		required.add(CoreAnnotations.TextAnnotation.class);
		required.add(CoreAnnotations.TokensAnnotation.class);
		required.add(CoreAnnotations.LemmaAnnotation.class);
		required.add(CoreAnnotations.SentencesAnnotation.class);
		required.add(CoreAnnotations.NamedEntityTagAnnotation.class);
		required.add(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
		required.add(CoreAnnotations.CanonicalEntityMentionIndexAnnotation.class);
		required.add(CorefCoreAnnotations.CorefChainAnnotation.class);
		required.add(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
		required.add(CoreAnnotations.QuotationsAnnotation.class);

		// OOP annotations
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPLocationsAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.VaderSentimentAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.CoreNlpSentimentAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.CoreNlpGenderAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPGenderAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbGroupsAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbsAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounGroupsAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class);
		required.add(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdverbsAnnotation.class);

		return required;
	}
	
	/**
	 * Builds one {@link ContextualAnnotation} per entity phrase listed on the document and stores
	 * the retained ones on the document under {@code getAnnotationClass()}.
	 *
	 * <p>For every entity found under {@code getEntityAnnotationClass()} on the document:</p>
	 * <ol>
	 * <li>{@code scoreDocument} is called once;</li>
	 * <li>for each sentence carrying the entity key, {@code scoreSentence} is called once per
	 * sentence-level phrase whose name equals the entity's name;</li>
	 * <li>for each token of such a sentence carrying the entity key, {@code scoreToken} is called
	 * once per token-level phrase whose name equals the entity's name.</li>
	 * </ol>
	 *
	 * <p>An exception thrown while scoring an entity is logged and ends scoring for that entity
	 * only; whatever was accumulated before the failure is still tested against the threshold.
	 * Only annotations whose importance is strictly greater than 1 are kept, after their
	 * sentiment averages have been set.</p>
	 *
	 * <p>If the document has no entity annotation at all, an empty list is stored.</p>
	 *
	 * @param annotation the CoreNLP annotation to read entities from and write results to
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	@Override
	public void annotate(Annotation annotation) {
		CoreDocument document = new CoreDocument(annotation);
		Class entityClass = getEntityAnnotationClass();
		List<ContextualAnnotation> scoreList = new ArrayList<ContextualAnnotation>();
		List<PhraseAnnotation> entities = (List<PhraseAnnotation>) document.annotation().get(entityClass);
		if (entities == null) {
			// No entity annotation on the document: publish an empty result instead of failing with a NullPointerException.
			entities = new ArrayList<PhraseAnnotation>();
		}
		for (PhraseAnnotation phraseAnnotation : entities) {
			ContextualAnnotation contextualAnnotation = getConcreteAnnotation();
			contextualAnnotation.setCanonicalName(phraseAnnotation.getName());
			try {
				scoreDocument(document, contextualAnnotation);
				for (CoreSentence sentence : document.sentences()) {
					if (!sentence.coreMap().containsKey(entityClass)) {
						continue;
					}
					List<PhraseAnnotation> sentenceEntities = (List<PhraseAnnotation>) sentence.coreMap().get(entityClass);
					for (PhraseAnnotation sentenceAnnotation : sentenceEntities) {
						if (sentenceAnnotation.getName().equals(phraseAnnotation.getName())) {
							scoreSentence(document, sentence, contextualAnnotation);
						}
					}
					for (CoreLabel token : sentence.tokens()) {
						if (!token.containsKey(entityClass)) {
							continue;
						}
						List<PhraseAnnotation> tokenEntities = (List<PhraseAnnotation>) token.get(entityClass);
						for (PhraseAnnotation tokenAnnotation : tokenEntities) {
							if (tokenAnnotation.getName().equals(phraseAnnotation.getName())) {
								scoreToken(document, token, contextualAnnotation);
							}
						}
					}
				}
			}
			catch (Exception e) {
				// Best effort: one failing entity must not abort the others. Log with context and stack trace.
				getLogger().error("Failed to score entity " + phraseAnnotation.getName(), e);
			}
			if (contextualAnnotation.getImportance().compareTo(BigDecimal.ONE) > 0) {
				contextualAnnotation.setVaderSentimentAvg();
				contextualAnnotation.setCoreNlpSentimentAvg();
				scoreList.add(contextualAnnotation);
			}
		}
		document.annotation().set(getAnnotationClass(), scoreList);
	}
	
	/**
	 * Intentionally does nothing: this class performs all of its scoring inside
	 * {@code annotate}.
	 *
	 * @param document ignored
	 */
	@Override
	public void score(CoreDocument document) {
		//pass
	}
	
	/**
	 * Writes this annotator's document-level result into {@code json}, keyed by the simple name
	 * of {@code getAnnotationClass()}. Nothing is written when the document does not carry that
	 * annotation. BigDecimal values are written in plain (non-scientific) notation.
	 *
	 * @param document the document holding the annotation to serialize
	 * @param json the node that receives the serialized annotation
	 */
	@SuppressWarnings("unchecked")
	@Override
	public void serialize(CoreDocument document, ObjectNode json) {
		if (!document.annotation().containsKey(getAnnotationClass())) {
			return;
		}
		ObjectMapper mapper = new ObjectMapper();
		mapper.enable(SerializationFeature.INDENT_OUTPUT);
		mapper.configure(com.fasterxml.jackson.core.JsonGenerator.Feature.WRITE_BIGDECIMAL_AS_PLAIN, true);
		String fieldName = getAnnotationClass().getSimpleName();
		json.set(fieldName, mapper.valueToTree(document.annotation().get(getAnnotationClass())));
	}
	
	/**
	 * Not implemented yet: currently writes nothing to {@code json}.
	 *
	 * @param document ignored
	 * @param json left untouched
	 */
	@Override
	public void serializeAggregateDocument(CoreDocument document, ObjectNode json) {
		//TODO
	}
	
	/**
	 * Document-level scoring hook, called once per entity by {@code annotate} before any
	 * sentence or token scoring. This base implementation only delegates to
	 * {@code scoreThumbnails}; {@code document} is not used here.
	 *
	 * @param document the document being annotated
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreDocument(CoreDocument document, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException, URISyntaxException, FlickrException {
		scoreThumbnails(annotation);
	}
	
	
	/**
	 * Adds image results for the annotation's canonical name to its thumbnail collection,
	 * using exactly one image source chosen from the parameter store:
	 * <ol>
	 * <li>Bing, when {@code azure_apiKey} is set;</li>
	 * <li>otherwise Flickr, when both {@code flickr_apiKey} and {@code faceplusplus_apiKey} are set;</li>
	 * <li>otherwise Wikimedia.</li>
	 * </ol>
	 *
	 * @param annotation the entity annotation whose thumbnails are extended
	 */
	protected void scoreThumbnails(ContextualAnnotation annotation) throws IOException, URISyntaxException, FlickrException {
		if (getParameterStore().getProperty("azure_apiKey") != null) {
			annotation.getThumbnails().addAll(
					BingUtils.getInstance(getParameterStore()).getImagesByText(annotation.getCanonicalName()));
			return;
		}

		boolean flickrConfigured = getParameterStore().getProperty("flickr_apiKey") != null
				&& getParameterStore().getProperty("faceplusplus_apiKey") != null;
		if (flickrConfigured) {
			annotation.getThumbnails().addAll(
					FlickrUtils.getInstance(getParameterStore()).getImagesByText(annotation.getCanonicalName()));
			return;
		}

		// Fallback when no API keys are configured.
		annotation.getThumbnails().addAll(
				WikimediaUtils.getInstance(getParameterStore()).getImagesByText(annotation.getCanonicalName()));
	}
	
	
	/**
	 * Records one sentence-level mention of the entity: adds 1 to the annotation's importance,
	 * updates its first/last appearance with the sentence id, and adds the sentence's Vader and
	 * CoreNLP sentiment values.
	 *
	 * <p>The first appearance is only written while its current value is negative; the last
	 * appearance is only ever moved forward.</p>
	 *
	 * @param document the document containing the sentence
	 * @param sentence the sentence that mentions the entity
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreSentence(CoreDocument document, CoreSentence sentence, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		annotation.addImportance(1);

		int sentenceIndex = new BigDecimal(
				CoreNlpUtils.getInstance(getParameterStore()).getSentenceIdFromSentence(document, sentence)
		).intValue();

		if (annotation.getFirstAppearance() < 0) {
			annotation.setFirstAppearance(sentenceIndex);
		}
		if (annotation.getLastAppearance() < sentenceIndex) {
			annotation.setLastAppearance(sentenceIndex);
		}

		annotation.addVaderSentiment(
				sentence.coreMap().get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.VaderSentimentAnnotation.class));
		annotation.addCoreNlpSentiment(
				sentence.coreMap().get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.CoreNlpSentimentAnnotation.class));
	}
	
	/**
	 * Token-level scoring hook, called by {@code annotate} for each token-level phrase that
	 * matches the entity. Scores the token's dependency relations first, then its Wikipedia gloss.
	 *
	 * @param document the document containing the token
	 * @param token the token that mentions the entity
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreToken(CoreDocument document, CoreLabel token, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		scoreDependencies(document, token, annotation);
		scoreWikipediaGloss(document, token, annotation);
	}

	
	/**
	 * Adds the token's Wikipedia gloss to the annotation's gloss collection, unless the token
	 * has no gloss annotation or the gloss is already present.
	 *
	 * @param document the document containing the token (not used here)
	 * @param token the token whose gloss is read
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreWikipediaGloss(CoreDocument document, CoreLabel token, ContextualAnnotation annotation) {
		if (!token.containsKey(OOPWikipediaGlossAnnotation.class)) {
			return;
		}
		if (annotation.getWikipediaGlosses().contains(token.get(OOPWikipediaGlossAnnotation.class))) {
			// already recorded
			return;
		}
		annotation.getWikipediaGlosses().add(token.get(OOPWikipediaGlossAnnotation.class));
	}
	
	/**
	 * Adds every score in {@code annotationScore} onto the score stored under the same name in
	 * {@code existingScoreMap}. A name that is missing from (or mapped to {@code null} in) the
	 * existing map is treated as zero. A {@code null} {@code annotationScore} is a no-op.
	 *
	 * @param annotationScore the scores to add; may be {@code null}
	 * @param existingScoreMap the running totals, modified in place
	 */
	protected void scoreSubAnnotation(Map<String, BigDecimal> annotationScore, Map<String, BigDecimal> existingScoreMap) {
		if (annotationScore == null) {
			return;
		}
		for (Map.Entry<String, BigDecimal> scoreEntry : annotationScore.entrySet()) {
			String scoreName = scoreEntry.getKey();
			BigDecimal runningTotal = existingScoreMap.get(scoreName);
			if (runningTotal == null) {
				runningTotal = BigDecimal.ZERO;
			}
			existingScoreMap.put(scoreName, runningTotal.add(scoreEntry.getValue()));
		}
	}
	
	/**
	 * Adds the token's {@code OOPVerbsAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * @param token the token whose verb scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreVerbs(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (!token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbsAnnotation.class)) {
			return existingScoreMap;
		}
		scoreSubAnnotation(
				token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbsAnnotation.class),
				existingScoreMap);
		return existingScoreMap;
	}
	
	/**
	 * Adds the token's {@code OOPVerbGroupsAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * @param token the token whose verb-group scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreVerbGroups(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (!token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbGroupsAnnotation.class)) {
			return existingScoreMap;
		}
		scoreSubAnnotation(
				token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbGroupsAnnotation.class),
				existingScoreMap);
		return existingScoreMap;
	}
	
	/**
	 * Adds the token's {@code OOPAdverbsAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * @param token the token whose adverb scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreAdverbs(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (!token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdverbsAnnotation.class)) {
			return existingScoreMap;
		}
		scoreSubAnnotation(
				token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdverbsAnnotation.class),
				existingScoreMap);
		return existingScoreMap;
	}
	
	/**
	 * Adds the token's {@code OOPAdjectivesAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * @param token the token whose adjective scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreAdjectives(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (!token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class)) {
			return existingScoreMap;
		}
		scoreSubAnnotation(
				token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class),
				existingScoreMap);
		return existingScoreMap;
	}
	
	/**
	 * Adds the token's {@code OOPNounsAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * @param token the token whose noun scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreNouns(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (!token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class)) {
			return existingScoreMap;
		}
		scoreSubAnnotation(
				token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class),
				existingScoreMap);
		return existingScoreMap;
	}
	
	/**
	 * Adds the token's {@code OOPNounGroupsAnnotation} scores, if it has any, onto
	 * {@code existingScoreMap}.
	 *
	 * <p>The presence check is made on {@code OOPNounGroupsAnnotation}, the same key whose value
	 * is read, matching the other {@code scoreXxx} methods. It previously tested
	 * {@code OOPNounsAnnotation}, so a token carrying noun groups but no noun scores was skipped.</p>
	 *
	 * @param token the token whose noun-group scores are read
	 * @param existingScoreMap the running totals, modified in place
	 * @return {@code existingScoreMap}, the same instance that was passed in
	 */
	protected Map<String, BigDecimal> scoreNounGroups(CoreLabel token, Map<String, BigDecimal> existingScoreMap) {
		if (token.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounGroupsAnnotation.class)) {
			Map<String,BigDecimal> annotationScore = token.get(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounGroupsAnnotation.class);
			scoreSubAnnotation(annotationScore, existingScoreMap);
		}
		return existingScoreMap;
	}
	
	/**
	 * Scores the entity from the token's dependency relations by running
	 * {@code scoreDependenciesGov} and then {@code scoreDependenciesDep}.
	 *
	 * @param document the document containing the token
	 * @param token the token that mentions the entity
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreDependencies(CoreDocument document, CoreLabel token, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		scoreDependenciesGov(document, token, annotation);
		scoreDependenciesDep(document, token, annotation);
	}

	
	/**
	 * Scores the entity from the relations returned by
	 * {@code CoreNlpUtils.getTypedDependencyGovFromToken} for {@code token}. Judging by how the
	 * results are used here, these are the relations in which the token is the governor
	 * (confirm against {@code CoreNlpUtils}).
	 *
	 * <p>For each relation, regardless of its type, the dependent word contributes:</p>
	 * <ul>
	 * <li>its noun and noun-group scores, when it carries {@code OOPNounsAnnotation};</li>
	 * <li>its adjective scores, when it carries {@code OOPAdjectivesAnnotation}.</li>
	 * </ul>
	 *
	 * @param document the document containing the token
	 * @param token the token that mentions the entity
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreDependenciesGov(CoreDocument document, CoreLabel token, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		List<TypedDependency> deps = CoreNlpUtils.getInstance(getParameterStore()).getTypedDependencyGovFromToken(document, token);
		for (TypedDependency dependency : deps) {
			CoreLabel dependent = dependency.dep().backingLabel();
			if (dependent.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class)) {
				annotation.getAttributes().put(
					"OOPNounsAnnotation",
					scoreNouns(dependent, annotation.getAttribute("OOPNounsAnnotation"))
				);
				annotation.getAttributes().put(
					"OOPNounGroupsAnnotation",
					scoreNounGroups(dependent, annotation.getAttribute("OOPNounGroupsAnnotation"))
				);
			}
			if (dependent.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class)) {
				annotation.getAttributes().put(
					"OOPAdjectivesAnnotation",
					scoreAdjectives(dependent, annotation.getAttribute("OOPAdjectivesAnnotation"))
				);
			}
		}
	}
	
	
	/**
	 * Scores the entity from the relations returned by
	 * {@code CoreNlpUtils.getTypedDependencyDepFromToken} for {@code token}. Judging by how the
	 * results are used here, these are the relations in which the token is the dependent
	 * (confirm against {@code CoreNlpUtils}).
	 *
	 * <p>Two relation types are handled:</p>
	 * <ul>
	 * <li>{@code nsubj} with a verb governor (tag VB, VBD, VBG, VBN, VBP or VBZ): the verb's verb
	 * and verb-group scores are added, plus the adverb scores of the verb's {@code advmod}
	 * dependents. "John walks quickly" -> John : walk, quickly.</li>
	 * <li>{@code nsubj} with any other governor: the governor's noun, noun-group and adjective
	 * scores are added. "John was angry" -> John : angry.</li>
	 * <li>{@code nmod:poss}: the possessed word's noun and noun-group scores are added, plus the
	 * adjective scores of its {@code amod} dependents. "John's black hair" -> John : hair, black.</li>
	 * </ul>
	 *
	 * <p>The {@code amod} step looks up the relations governed by the possessed word and scores
	 * each modifier's own label. It previously looked up the relations in which the possessed
	 * word was the dependent and scored the possessed word itself, so the adjective was never
	 * reached.</p>
	 *
	 * @param document the document containing the token
	 * @param token the token that mentions the entity
	 * @param annotation the entity annotation being populated
	 */
	protected void scoreDependenciesDep(CoreDocument document, CoreLabel token, ContextualAnnotation annotation) throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException {
		// Built once per call instead of once per dependency.
		List<String> verbTags = Arrays.asList("VB","VBD","VBG","VBN","VBP","VBZ");
		List<TypedDependency> deps = CoreNlpUtils.getInstance(getParameterStore()).getTypedDependencyDepFromToken(document, token);
		for (TypedDependency dependency : deps) {
			String relationName = dependency.reln().getShortName();
			if ("nsubj".equals(relationName)) {
				CoreLabel governor = dependency.gov().backingLabel();
				if (verbTags.contains(dependency.gov().tag())) {
					//Subject of a verb. John walks -> John : walk
					if (governor.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbsAnnotation.class)) {
						annotation.getAttributes().put(
							"OOPVerbsAnnotation",
							scoreVerbs(governor, annotation.getAttribute("OOPVerbsAnnotation"))
						);
					}
					if (governor.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPVerbGroupsAnnotation.class)) {
						annotation.getAttributes().put(
							"OOPVerbGroupsAnnotation",
							scoreVerbGroups(governor, annotation.getAttribute("OOPVerbGroupsAnnotation"))
						);
					}

					List<TypedDependency> verbDeps = CoreNlpUtils.getInstance(getParameterStore()).getTypedDependencyGovFromToken(
							document,
							CoreNlpUtils.getInstance(getParameterStore()).getTokenFromIndexedWord(document, dependency.gov())
					);
					for (TypedDependency verbDependency : verbDeps) {
						if ("advmod".equals(verbDependency.reln().getShortName())) {
							//Adverbial description. John walks quickly -> John : quickly
							CoreLabel adverb = verbDependency.dep().backingLabel();
							if (adverb.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdverbsAnnotation.class)) {
								annotation.getAttributes().put(
									"OOPAdverbsAnnotation",
									scoreAdverbs(adverb, annotation.getAttribute("OOPAdverbsAnnotation"))
								);
							}
						}
					}
				}
				else {
					//Subject of a non-verb predicate (noun). John is a doctor -> John : doctor
					if (governor.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class)) {
						scoreGovernorNouns(governor, annotation);
					}
					//Subject of a non-verb predicate (adjective). John was angry -> John : angry
					if (governor.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class)) {
						annotation.getAttributes().put(
							"OOPAdjectivesAnnotation",
							scoreAdjectives(governor, annotation.getAttribute("OOPAdjectivesAnnotation"))
						);
					}
				}
			}

			if ("nmod:poss".equals(relationName)) {
				//Possessive. John's hair -> John : hair.
				CoreLabel possessed = dependency.gov().backingLabel();
				if (possessed.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPNounsAnnotation.class)) {
					scoreGovernorNouns(possessed, annotation);
				}
				// The modifiers of the possessed word are the relations it governs.
				List<TypedDependency> possessedDeps = CoreNlpUtils.getInstance(getParameterStore()).getTypedDependencyGovFromToken(
						document,
						CoreNlpUtils.getInstance(getParameterStore()).getTokenFromIndexedWord(document, dependency.gov())
				);
				for (TypedDependency possessedDependency : possessedDeps) {
					if ("amod".equals(possessedDependency.reln().getShortName())) {
						//Possessive modifier.  John's black hair -> John : black
						CoreLabel modifier = possessedDependency.dep().backingLabel();
						if (modifier.containsKey(io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPAdjectivesAnnotation.class)) {
							annotation.getAttributes().put(
								"OOPAdjectivesAnnotation",
								scoreAdjectives(modifier, annotation.getAttribute("OOPAdjectivesAnnotation"))
							);
						}
					}
				}
			}
		}
	}

	/**
	 * Adds the noun and noun-group scores of {@code label} to the annotation's
	 * {@code OOPNounsAnnotation} and {@code OOPNounGroupsAnnotation} attributes.
	 */
	private void scoreGovernorNouns(CoreLabel label, ContextualAnnotation annotation) {
		annotation.getAttributes().put(
			"OOPNounsAnnotation",
			scoreNouns(label, annotation.getAttribute("OOPNounsAnnotation"))
		);
		annotation.getAttributes().put(
			"OOPNounGroupsAnnotation",
			scoreNounGroups(label, annotation.getAttribute("OOPNounGroupsAnnotation"))
		);
	}

}