// CoreNlpSerializer.java
/*******************************************************************************
* Copyright (C) 2020 Ram Sadasiv
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package io.outofprintmagazine.nlp.pipeline.serializers;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.CorefChain.CorefMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreQuote;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.QuoteAttributionAnnotator;
import io.outofprintmagazine.nlp.pipeline.OOPAnnotations.OOPThumbnailAnnotation;
/**
 * <p>Creates the base json syntax tree from coreDocument.</p>
 * <ul>
 * <li>root (document)
 * <ul>
 * <li>metadata
 * <ul>
 * <li>DocIDAnnotation</li>
 * <li>DocTitleAnnotation</li>
 * <li>DocSourceTypeAnnotation</li>
 * <li>DocTypeAnnotation</li>
 * <li>AuthorAnnotation</li>
 * <li>DocDateAnnotation</li>
 * <li>OOPThumbnailAnnotation</li>
 * </ul>
 * </li>
 * <li>corefs (one object per chain, keyed by chain id, holding the list of mentions)</li>
 * <li>quotes</li>
 * <li>sentences
 * <ul>
 * <li>SentenceIndexAnnotation</li>
 * <li>text</li>
 * <li>tokens
 * <ul>
 * <li>tokenIndex</li>
 * <li>TokensAnnotation
 * <ul>
 * <li>word</li>
 * <li>originalText</li>
 * <li>lemma</li>
 * <li>characterOffsetBegin</li>
 * <li>characterOffsetEnd</li>
 * <li>pos</li>
 * <li>ner</li>
 * <li>before</li>
 * <li>after</li>
 * </ul>
 * </li>
 * </ul>
 * </li>
 * </ul>
 * </li>
 * </ul>
 * </li>
 * </ul>
 * <p>The "corefs", "quotes" and "sentences" arrays are always created; they are
 * left empty when the document carries no corresponding annotation.</p>
 * @author Ram Sadasiv
 *
 */
public class CoreNlpSerializer implements ISerializer {

    /** Value written for "speaker" / "canonicalSpeaker" when a quote has no attribution. */
    private static final String UNKNOWN_SPEAKER = "Unknown";

    ObjectMapper mapper = new ObjectMapper();

    /**
     * @return the Jackson mapper used to create the json nodes of this serializer
     */
    ObjectMapper getMapper() {
        return mapper;
    }

    public CoreNlpSerializer() {
        super();
    }

    /**
     * Adds a "metadata" object to documentNode and copies the document level
     * annotations into it. Only annotations actually present on the document are
     * written; each one is keyed by the simple name of its annotation class.
     *
     * @param document     the annotated document to read from
     * @param documentNode the json node that receives the "metadata" object
     */
    protected void serializeMetadata(CoreDocument document, ObjectNode documentNode) {
        ObjectNode metadata = documentNode.putObject("metadata");
        if (document.annotation().containsKey(CoreAnnotations.DocIDAnnotation.class)) {
            metadata.put(CoreAnnotations.DocIDAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.DocIDAnnotation.class));
        }
        if (document.annotation().containsKey(CoreAnnotations.DocTitleAnnotation.class)) {
            metadata.put(CoreAnnotations.DocTitleAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.DocTitleAnnotation.class));
        }
        if (document.annotation().containsKey(CoreAnnotations.DocSourceTypeAnnotation.class)) {
            metadata.put(CoreAnnotations.DocSourceTypeAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.DocSourceTypeAnnotation.class));
        }
        if (document.annotation().containsKey(CoreAnnotations.DocTypeAnnotation.class)) {
            metadata.put(CoreAnnotations.DocTypeAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.DocTypeAnnotation.class));
        }
        if (document.annotation().containsKey(CoreAnnotations.AuthorAnnotation.class)) {
            metadata.put(CoreAnnotations.AuthorAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.AuthorAnnotation.class));
        }
        if (document.annotation().containsKey(CoreAnnotations.DocDateAnnotation.class)) {
            metadata.put(CoreAnnotations.DocDateAnnotation.class.getSimpleName(),
                    document.annotation().get(CoreAnnotations.DocDateAnnotation.class));
        }
        if (document.annotation().containsKey(OOPThumbnailAnnotation.class)) {
            metadata.put(OOPThumbnailAnnotation.class.getSimpleName(),
                    document.annotation().get(OOPThumbnailAnnotation.class));
        }
    }

    /**
     * Adds a "corefs" array to documentNode. Every coreference chain becomes one
     * object with a single field, named after the chain id, that holds the chain's
     * mentions in textual order. The array stays empty when the document has no
     * CorefChainAnnotation.
     *
     * @param document     the annotated document to read from
     * @param documentNode the json node that receives the "corefs" array
     */
    protected void serializeCoref(CoreDocument document, ObjectNode documentNode) {
        ArrayNode corefsNode = documentNode.putArray("corefs");
        Map<Integer, CorefChain> corefChains =
                document.annotation().get(CorefCoreAnnotations.CorefChainAnnotation.class);
        if (corefChains == null) {
            return;
        }
        for (CorefChain chain : corefChains.values()) {
            CorefChain.CorefMention representative = chain.getRepresentativeMention();
            ObjectNode chainNode = getMapper().createObjectNode();
            corefsNode.add(chainNode);
            ArrayNode mentionListNode = chainNode.putArray(Integer.toString(chain.getChainID()));
            for (CorefMention mention : chain.getMentionsInTextualOrder()) {
                ObjectNode mentionNode = getMapper().createObjectNode();
                mentionNode.put("id", mention.mentionID);
                mentionNode.put("text", mention.mentionSpan);
                mentionNode.put("type", mention.mentionType.toString());
                mentionNode.put("number", mention.number.toString());
                mentionNode.put("gender", mention.gender.toString());
                mentionNode.put("animacy", mention.animacy.toString());
                mentionNode.put("startIndex", mention.startIndex);
                mentionNode.put("endIndex", mention.endIndex);
                mentionNode.put("headIndex", mention.headIndex);
                mentionNode.put("sentNum", mention.sentNum);
                // mention.position is intentionally not serialized.
                // Identity comparison: flags the mention instance the chain itself
                // reports as its representative.
                mentionNode.put("isRepresentativeMention", mention == representative);
                mentionListNode.add(mentionNode);
            }
        }
    }

    /**
     * Adds a "quotes" array to documentNode with one object per quote. A missing
     * speaker or canonical speaker is written as "Unknown". The array stays empty
     * when the document exposes no quote list.
     *
     * @param document     the annotated document to read from
     * @param documentNode the json node that receives the "quotes" array
     */
    public void serializeQuotes(CoreDocument document, ObjectNode documentNode) {
        ArrayNode quotesNode = documentNode.putArray("quotes");
        // CoreDocument.quotes() can be null when no quote annotation was produced;
        // treat that like "no quotes", mirroring the coref handling.
        if (document.quotes() == null) {
            return;
        }
        for (CoreQuote quote : document.quotes()) {
            ObjectNode quoteNode = getMapper().createObjectNode();
            quoteNode.put("id", quote.coreMap().get(CoreAnnotations.QuotationIndexAnnotation.class));
            quoteNode.put("text", quote.coreMap().get(CoreAnnotations.TextAnnotation.class));
            quoteNode.put("beginIndex", quote.coreMap().get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
            quoteNode.put("endIndex", quote.coreMap().get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            quoteNode.put("beginToken", quote.coreMap().get(CoreAnnotations.TokenBeginAnnotation.class));
            quoteNode.put("endToken", quote.coreMap().get(CoreAnnotations.TokenEndAnnotation.class));
            quoteNode.put("beginSentence", quote.coreMap().get(CoreAnnotations.SentenceBeginAnnotation.class));
            quoteNode.put("endSentence", quote.coreMap().get(CoreAnnotations.SentenceEndAnnotation.class));
            quoteNode.put("speaker",
                    unknownIfNull(quote.coreMap().get(QuoteAttributionAnnotator.SpeakerAnnotation.class)));
            quoteNode.put("canonicalSpeaker",
                    unknownIfNull(quote.coreMap().get(QuoteAttributionAnnotator.CanonicalMentionAnnotation.class)));
            quotesNode.add(quoteNode);
        }
    }

    /** Returns value, or the "Unknown" placeholder when value is null. */
    private static String unknownIfNull(String value) {
        return value != null ? value : UNKNOWN_SPEAKER;
    }

    /**
     * Writes the complete base tree for the document: "metadata", "corefs",
     * "quotes" and "sentences" (each sentence with its tokens), in that order.
     *
     * @param document     the annotated document to read from
     * @param documentNode the root json node to populate
     */
    @Override
    public void serialize(CoreDocument document, ObjectNode documentNode) {
        serializeMetadata(document, documentNode);
        serializeCoref(document, documentNode);
        serializeQuotes(document, documentNode);
        // Paragraph grouping was tried and abandoned; the tree is simply
        // document -> sentences -> tokens.
        serializeSentences(document, documentNode);
    }

    /** Adds the "sentences" array; it stays empty when the document has no sentence list. */
    private void serializeSentences(CoreDocument document, ObjectNode documentNode) {
        ArrayNode sentencesList = documentNode.putArray("sentences");
        // CoreDocument.sentences() can be null when the document was not sentence-split.
        if (document.sentences() == null) {
            return;
        }
        for (CoreSentence sentence : document.sentences()) {
            ObjectNode sentenceNode = getMapper().createObjectNode();
            sentencesList.add(sentenceNode);
            sentenceNode.put(CoreAnnotations.SentenceIndexAnnotation.class.getSimpleName(),
                    sentence.coreMap().get(CoreAnnotations.SentenceIndexAnnotation.class));
            // Every double-quote character is removed from the sentence text
            // (existing behaviour; the reason is not documented in this file).
            sentenceNode.put("text", sentence.text().replace("\"", ""));
            ArrayNode tokensList = sentenceNode.putArray("tokens");
            for (CoreLabel token : sentence.tokens()) {
                tokensList.add(serializeToken(token));
            }
        }
    }

    /** Builds the json object for one token: "tokenIndex" plus a nested "TokensAnnotation" object. */
    private ObjectNode serializeToken(CoreLabel token) {
        ObjectNode tokenNode = getMapper().createObjectNode();
        tokenNode.put("tokenIndex", token.index());
        ObjectNode coreNlpTokenAnnotationsNode = getMapper().createObjectNode();
        tokenNode.set(CoreAnnotations.TokensAnnotation.class.getSimpleName(), coreNlpTokenAnnotationsNode);
        coreNlpTokenAnnotationsNode.put("word", token.word());
        coreNlpTokenAnnotationsNode.put("originalText", token.originalText());
        coreNlpTokenAnnotationsNode.put("lemma", token.lemma());
        coreNlpTokenAnnotationsNode.put("characterOffsetBegin", token.beginPosition());
        coreNlpTokenAnnotationsNode.put("characterOffsetEnd", token.endPosition());
        coreNlpTokenAnnotationsNode.put("pos", token.tag());
        coreNlpTokenAnnotationsNode.put("ner", token.ner());
        coreNlpTokenAnnotationsNode.put("before", token.get(CoreAnnotations.BeforeAnnotation.class));
        coreNlpTokenAnnotationsNode.put("after", token.get(CoreAnnotations.AfterAnnotation.class));
        return tokenNode;
    }

    /**
     * Debug helper: prints the simple class name of DocIDAnnotation, i.e. the key
     * format used for the metadata fields.
     *
     * @param argv unused
     */
    public static void main(String[] argv) {
        System.out.println(CoreAnnotations.DocIDAnnotation.class.getSimpleName());
    }

    /**
     * Writes only the document metadata into json.
     *
     * @param aggregate the aggregate to serialize; it is cast to {@link CoreDocument}
     * @param json      the json node that receives the "metadata" object
     * @throws ClassCastException if aggregate is not a CoreDocument
     */
    //aggregate must be the document itself?
    @Override
    public void serializeAggregate(Object aggregate, ObjectNode json) {
        serializeMetadata((CoreDocument) aggregate, json);
    }
}