package lemmaextractor;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import lemmaextractor.results.Result;

import java.util.ArrayList;
import java.util.Properties;

/**
 * English implementation of {@link LanguageProcessor}, backed by a Stanford
 * CoreNLP pipeline. Extracts per-token lemmas with character offsets into the
 * input text and, when NER is enabled, named entities merged across
 * consecutive tokens that share the same entity tag.
 *
 * Created by michal on 4/15/14.
 */
public class EnglishProcessor implements LanguageProcessor {

    private final StanfordCoreNLP pipeline;

    /** Builds a full pipeline: tokenization, sentence splitting, POS, lemmas, NER. */
    public EnglishProcessor(String taggerModel, String nerModel) {
        Properties props = new Properties();
        props.put("pos.model", taggerModel);
        props.put("ner.model", nerModel);
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner");
        pipeline = new StanfordCoreNLP(props);
    }

    /** Builds a lemma-only pipeline; processFull() yields no entities without NER. */
    public EnglishProcessor(String taggerModel) {
        Properties props = new Properties();
        props.put("pos.model", taggerModel);
        props.put("annotators", "tokenize, ssplit, pos, lemma");
        pipeline = new StanfordCoreNLP(props);
    }

    @Override
    public Result processFull(String text) throws Exception {
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        Result result = new Result("en");
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            // State for the named entity currently being assembled. Consecutive
            // tokens carrying the same NE tag are merged into a single entity.
            String annotationChannel = "";
            String namedEntity = "";
            String namedEntityBase = "";
            ArrayList<Integer> namedEntityIndices = new ArrayList<Integer>();

            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
                String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

                // Use the tokenizer's own character offsets instead of searching
                // the raw text: CoreNLP normalizes some tokens (e.g. "(" becomes
                // "-LRB-"), so indexOf() on the token text can miss or misalign.
                int tokenStart = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                int tokenEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                result.addLemma(lemma, tokenStart, tokenEnd);

                // ne is null when the pipeline was built without the NER annotator.
                if (ne != null && !ne.equals("O")) {
                    if (ne.equals(annotationChannel)) {
                        // Same tag as the previous token: extend the current entity,
                        // preserving the original whitespace between the tokens.
                        String before = token.getString(CoreAnnotations.BeforeAnnotation.class);
                        namedEntity += before + word;
                        namedEntityBase += before + lemma;
                        namedEntityIndices.add(result.getLemmas().size() - 1);
                    } else {
                        // Tag changed: flush any entity in progress, then start a new one.
                        if (!namedEntity.isEmpty()) {
                            result.addNamedEntity(namedEntity, namedEntityBase, namedEntityIndices);
                        }
                        annotationChannel = ne;
                        namedEntity = word;
                        namedEntityBase = lemma;
                        namedEntityIndices = new ArrayList<Integer>();
                        namedEntityIndices.add(result.getLemmas().size() - 1);
                    }
                } else if (!namedEntity.isEmpty()) {
                    // Left an entity span: flush it and reset the state.
                    result.addNamedEntity(namedEntity, namedEntityBase, namedEntityIndices);
                    annotationChannel = "";
                    namedEntity = "";
                    namedEntityBase = "";
                    namedEntityIndices = new ArrayList<Integer>();
                }
            }

            // Flush an entity that runs to the end of the sentence; previously
            // these were dropped because flushing only happened on a tag change.
            if (!namedEntity.isEmpty()) {
                result.addNamedEntity(namedEntity, namedEntityBase, namedEntityIndices);
            }
        }
        return result;
    }

    @Override
    public Result processLemmas(String text) {
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        Result result = new Result("en");
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
                int tokenStart = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                int tokenEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                result.addLemma(lemma, tokenStart, tokenEnd);
            }
        }
        return result;
    }
}
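
/**
 * Minimal usage sketch, kept out of the public API. Assumptions not taken from
 * this file: the model file names below are the standard Stanford download
 * names and must be on the classpath or given as paths, and Result exposes the
 * entries added via addLemma()/addNamedEntity() in a printable form.
 */
class EnglishProcessorDemo {
    public static void main(String[] args) throws Exception {
        LanguageProcessor processor = new EnglishProcessor(
                "english-left3words-distsim.tagger",
                "english.all.3class.distsim.crf.ser.gz");
        // Expected: one lemma per token ("Barack", "Obama", "visit", ...) with
        // character offsets, and "Barack Obama" merged into a single PERSON
        // entity because both tokens carry the same NER tag.
        Result result = processor.processFull("Barack Obama visited Warsaw yesterday.");
        System.out.println(result);
    }
}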