ParseTOC.java
/*******************************************************************************
* Copyright (C) 2020 Ram Sadasiv
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package io.outofprintmagazine.corpus.batch.impl.ebook;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import io.outofprintmagazine.corpus.batch.CorpusBatchStep;
import io.outofprintmagazine.corpus.batch.ICorpusBatchStep;
/**
 * Batch step that guesses the story titles contained in a text document and
 * emits one output item per title.
 *
 * <p>Heuristic: a story title occupies a whole line written in ALL CAPS or
 * starting with a capital. If the document has a table of contents, each title
 * shows up exactly twice (once in the TOC, once above the story); otherwise
 * each title shows up exactly once.
 */
public class ParseTOC extends CorpusBatchStep implements ICorpusBatchStep {

	private static final Logger logger = LogManager.getLogger(ParseTOC.class);

	/** Matches a line whose final character is punctuation. Compiled once instead of once per line. */
	private static final Pattern ENDS_WITH_PUNCTUATION = Pattern.compile("^.*\\p{Punct}$");

	/**
	 * Matches a line that opens with a quotation or prime mark: ' " U+201C U+201D
	 * U+201E U+201F U+2033 U+2036. Compiled once instead of once per line.
	 */
	private static final Pattern STARTS_WITH_QUOTE =
			Pattern.compile("^['\"\\u201C\\u201D\\u201E\\u201F\\u2033\\u2036].*");

	@SuppressWarnings("unused")
	private Logger getLogger() {
		return logger;
	}

	public ParseTOC() {
		super();
	}

	/**
	 * Detects story titles in the text document of {@code inputStepItem}.
	 *
	 * @param inputStepItem the step item whose text document is scanned
	 * @return one item per detected title, in document order; each item is a copy
	 *         of the input with its title set and, for every item but the last, a
	 *         {@code "nextTitle"} field holding the following title. Empty when no
	 *         title was detected.
	 * @throws Exception if the document cannot be read or an output item cannot be built
	 */
	@Override
	public ArrayNode runOne(ObjectNode inputStepItem) throws Exception {
		ArrayNode retval = getMapper().createArrayNode();

		// The document is read once; both heuristics below work on the same scan.
		List<String> titleLikeLines = readTitleLikeLines(getTextDocumentFromStorage(inputStepItem));
		Map<String, Integer> totals = new HashMap<String, Integer>();
		for (String line : titleLikeLines) {
			increment(totals, line);
		}

		// If there is a TOC, story titles will appear twice.
		List<String> titles = linesOccurringExactly(titleLikeLines, totals, 2);
		// If there is no TOC (fewer than two repeated titles), story titles will appear once.
		if (titles.size() < 2) {
			titles = linesOccurringExactly(titleLikeLines, totals, 1);
		}

		for (int i = 0; i < titles.size(); i++) {
			ObjectNode outputStepItem = copyInputToOutput(inputStepItem);
			setTitle(titles.get(i), outputStepItem);
			// Every title except the last records its successor.
			if (i + 1 < titles.size()) {
				outputStepItem.put(
						"nextTitle",
						titles.get(i + 1)
				);
			}
			retval.add(outputStepItem);
		}
		return retval;
	}

	/**
	 * Splits {@code text} into lines, trims each one and keeps those that look like titles.
	 *
	 * @param text the full document text
	 * @return the trimmed title-like lines in document order, duplicates included
	 * @throws IOException if reading from the in-memory reader fails
	 */
	private static List<String> readTitleLikeLines(String text) throws IOException {
		List<String> titleLikeLines = new ArrayList<String>();
		try (BufferedReader reader = new BufferedReader(new StringReader(text))) {
			String line;
			while ((line = reader.readLine()) != null) {
				line = line.trim();
				if (isTitleLike(line)) {
					titleLikeLines.add(line);
				}
			}
		}
		return titleLikeLines;
	}

	/**
	 * Tells whether a trimmed line could be a story title: longer than one
	 * character, contains no period, does not end with punctuation, does not
	 * start with a quotation mark, and is either entirely upper case or left
	 * unchanged by {@link StringUtils#capitalize(String)}.
	 */
	private static boolean isTitleLike(String line) {
		if (line.length() <= 1 || line.contains(".")) {
			return false;
		}
		if (ENDS_WITH_PUNCTUATION.matcher(line).matches() || STARTS_WITH_QUOTE.matcher(line).matches()) {
			return false;
		}
		// Locale.ROOT keeps the ALL CAPS test independent of the JVM's default locale.
		return line.toUpperCase(Locale.ROOT).equals(line) || StringUtils.capitalize(line).equals(line);
	}

	/**
	 * Selects the lines that occur exactly {@code occurrences} times overall.
	 * Each selected line is positioned where its last occurrence sits in the
	 * document, so for a repeated title that is its second appearance.
	 *
	 * @param lines       title-like lines in document order, duplicates included
	 * @param totals      total number of occurrences of each line in {@code lines}
	 * @param occurrences the exact occurrence count a line must have to be selected
	 * @return the selected lines, each listed once
	 */
	private static List<String> linesOccurringExactly(List<String> lines, Map<String, Integer> totals, int occurrences) {
		List<String> selected = new ArrayList<String>();
		Map<String, Integer> seen = new HashMap<String, Integer>();
		for (String line : lines) {
			int sightings = increment(seen, line);
			if (sightings == occurrences && totals.get(line).intValue() == occurrences) {
				selected.add(line);
			}
		}
		return selected;
	}

	/** Adds one to the counter stored under {@code key} (starting from zero) and returns the new value. */
	private static int increment(Map<String, Integer> counters, String key) {
		Integer current = counters.get(key);
		int updated = (current == null) ? 1 : current.intValue() + 1;
		counters.put(key, Integer.valueOf(updated));
		return updated;
	}
}