package lemmaextractor;

import lemmaextractor.results.Result;
import liner2.LinerOptions;
import liner2.chunker.Chunker;
import liner2.chunker.factory.ChunkerFactory;
import liner2.chunker.factory.ChunkerManager;
import liner2.features.TokenFeatureGenerator;
import liner2.structure.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

/**
 * Polish-language processor: lemmatizes text with the WCRFT tagger and,
 * when a Liner2 configuration is supplied, recognizes named entities.
 */
class PolishProcessor implements LanguageProcessor {

    private WCRFT wcrft;
    private TokenFeatureGenerator featureGenerator;
    private Chunker namedEntityRecognizer;

    /** Full pipeline: WCRFT tagger plus Liner2 named-entity recognizer. */
    public PolishProcessor(String wcrftConfig, String wcrftModel, String linerIniPath) {
        try {
            wcrft = new WCRFT(wcrftConfig, wcrftModel);
        } catch (IOException e) {
            e.printStackTrace();
        }
        LinerOptions linerOpts = new LinerOptions();
        linerOpts.loadIni(linerIniPath);
        featureGenerator = new TokenFeatureGenerator(linerOpts.features);
        ChunkerManager cm = null;
        try {
            cm = ChunkerFactory.loadChunkers(linerOpts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Guard against a failed chunker load; otherwise the lookup below would NPE.
        if (cm != null) {
            namedEntityRecognizer = cm.getChunkerByName(linerOpts.getOptionUse());
        }
    }

    /** Tagger-only pipeline: lemmatization without named-entity recognition. */
    public PolishProcessor(String wcrftConfig, String wcrftModel) {
        try {
            wcrft = new WCRFT(wcrftConfig, wcrftModel);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public Result processFull(String text) throws Exception {
        if (namedEntityRecognizer == null) {
            throw new Exception("Model initialized with tagger only, use processLemmas() instead");
        }
        String taggerOut = wcrft.process(text);
        Document ps = taggerToLiner(taggerOut);
        featureGenerator.generateFeatures(ps);
        HashMap<Sentence, AnnotationSet> linerOut = namedEntityRecognizer.chunk(ps);

        Result result = new Result("pl");
        int positionInText = 0;
        int numTokensInPrevSents = 0;
        for (Sentence s : ps.getSentences()) {
            // Map each token back to its character span in the original text.
            for (Token t : s.getTokens()) {
                String orth = t.getAttributeValue(ps.getAttributeIndex().getIndex("orth"));
                String base = t.getAttributeValue(ps.getAttributeIndex().getIndex("base"));
                int tokenStart = text.indexOf(orth, positionInText);
                int tokenEnd = tokenStart + orth.length();
                result.addLemma(base, tokenStart, tokenEnd);
                positionInText = tokenEnd;
            }
            // Annotation bounds are sentence-local; shift them to document-level token indices.
            for (Annotation ann : linerOut.get(s).chunkSet()) {
                ArrayList<Integer> annTokensIndices = new ArrayList<Integer>();
                for (int i = ann.getBegin() + numTokensInPrevSents; i <= ann.getEnd() + numTokensInPrevSents; i++) {
                    annTokensIndices.add(i);
                }
                result.addNamedEntity(ann.getText(), ann.getBaseText(), annTokensIndices);
            }
            numTokensInPrevSents += s.getTokenNumber();
        }
        return result;
    }

    @Override
    public Result processLemmas(String text) {
        Result result = new Result("pl");
        String taggerOut = null;
        try {
            taggerOut = wcrft.process(text);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (taggerOut == null) {
            // Tagging failed; return the empty result instead of dereferencing null below.
            return result;
        }
        int positionInText = 0;
        for (String tokenStr : taggerOut.split("\n")) {
            if (!tokenStr.equals("")) {
                // WCRFT emits one token per line: orth, base and ctag separated by tabs.
                String[] attrs = tokenStr.trim().split("\t");
                String orth = attrs[0];
                String base = attrs[1];
                int tokenStart = text.indexOf(orth, positionInText);
                int tokenEnd = tokenStart + orth.length();
                result.addLemma(base, tokenStart, tokenEnd);
                positionInText = tokenEnd;
            }
        }
        return result;
    }

    /**
     * Converts raw WCRFT output (tab-separated token lines, blank line between
     * sentences) into a Liner2 Document.
     */
    private Document taggerToLiner(String taggerOut) {
        // Index listing the token attributes (orth, base, ctag).
        TokenAttributeIndex attrIdx = new TokenAttributeIndex();
        attrIdx.addAttribute("orth");
        attrIdx.addAttribute("base");
        attrIdx.addAttribute("ctag");

        Document doc = new Document("linerInput", attrIdx);
        Paragraph p = new Paragraph("");
        doc.addParagraph(p);

        Sentence sent = new Sentence();
        for (String tokenStr : taggerOut.split("\n")) {
            if (tokenStr.equals("")) {
                // Blank line marks a sentence boundary.
                p.addSentence(sent);
                sent = new Sentence();
            } else {
                String[] attrs = tokenStr.trim().split("\t");
                Token token = new Token(attrs[0], new Tag(attrs[1], attrs[2], false), attrIdx);
                sent.addToken(token);
            }
        }
        // Add the last sentence, but avoid appending an empty one when the
        // tagger output ends with a blank line.
        if (sent.getTokenNumber() > 0) {
            p.addSentence(sent);
        }
        return doc;
    }
}
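
/*
 * Usage sketch (illustrative, not part of the original class): shows how the two
 * pipelines are driven. The config/model/ini paths below are hypothetical
 * placeholders; real values depend on the local WCRFT and Liner2 installations.
 */
class PolishProcessorDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths: substitute the actual WCRFT config, WCRFT model
        // and Liner2 ini file available on the target machine.
        PolishProcessor processor =
                new PolishProcessor("wcrft-config.ini", "wcrft-model", "liner2.ini");

        String text = "Jan Kowalski mieszka w Warszawie.";

        // Full pipeline: lemmas with character offsets plus named entities.
        Result full = processor.processFull(text);

        // Tagger-only alternative (also works with the two-argument constructor).
        Result lemmasOnly = processor.processLemmas(text);
    }
}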