package dokumenty.dekompozycja; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Scanner; import dokumenty.Element; import pl.wroc.pwr.file.ImageFolderScanner; import pl.wroc.pwr.image.IRGBImage; import pl.wroc.pwr.image.io.GenericLoader; import pl.wroc.pwr.image.io.GenericSaver; import pl.wroc.pwr.image.model.RGBImage; public class DekompozycjaDokumentuPWR { public String OCRExec = "tesseract"; public String TempFolder = ""; public String ElementExtension = ".result"; public static void main(String[] args) { System.out.println("XDD (PWR variant) build: 5.10.2012"); if (args.length != 5) { System.out.println("Illegal execution arguments!"); System.out.println("Should be: java -Xmx1024m -Xms512m -jar XDD.jar [output file] [region extension] [temp folder] [source folder] [file prefix]"); System.out.println("For example: java -Xmx1024m -Xms512m -jar XDD.jar C:\\NEKST\\Temp\\b001.f001.PWR.xml .expected C:\\NEKST\\Temp\\ C:\\NEKST\\ b001.f001_"); } else { DekompozycjaDokumentuPWR XDD = new DekompozycjaDokumentuPWR(); XDD.TempFolder = args[2]; XDD.OCRExec = "tesseract " + XDD.TempFolder + "temp.png " + XDD.TempFolder + "temp -l pol"; XDD.ElementExtension = args[1]; LinkedList FileList = new LinkedList(); File InFile = new File(args[3]); if (InFile.isFile()) { System.out.println("A folder is required for this operation!"); } else if (InFile.isDirectory()) { System.out.print(" [!] "); for(File CurrentFile : new ImageFolderScanner().apply(InFile)) if (CurrentFile.isFile() && CurrentFile.getName().startsWith(args[4])) FileList.add(CurrentFile.getAbsolutePath()); } Collections.sort(FileList); String[] Files = new String[FileList.size()]; FileList.toArray(Files); XDD.Decompose(Files, args[0]); } } private List HeaderElements; private List TitleElements; private List ContentElements; private List FooterElements; private List OtherElements; public void Decompose(String[] Input, String Output) { try { System.out.println(" [?] Files found: " + Input.length); LinkedList Chunks = new LinkedList(); LinkedList TrailingChunks = new LinkedList(); //Build Chunk List for(int i = 0; i < Input.length; i++) { String File = Input[i]; HeaderElements = new ArrayList(); TitleElements = new ArrayList(); ContentElements = new ArrayList(); FooterElements = new ArrayList(); OtherElements = new ArrayList(); IRGBImage Source = null; System.out.println(" [!] Currently decomposing: " + File); LinkedList CurrentChunks = new LinkedList(); //Load Files System.out.print(" [•] Reading Image... "); Source = new GenericLoader(File).apply(); System.out.println("Ok"); System.out.print(" [•] Reading Elements... "); BufferedReader In = new BufferedReader(new FileReader(File + ElementExtension)); String Line; while ((Line = In.readLine()) != null) { Element New = Element.ParsujLinie(Line); if (HeaderClasses.contains(New.pobierzTyp().toLowerCase())) HeaderElements.add(New); else if (TitleClasses.contains(New.pobierzTyp().toLowerCase())) TitleElements.add(New); else if (ContentClasses.contains(New.pobierzTyp().toLowerCase())) ContentElements.add(New); else if (FooterClasses.contains(New.pobierzTyp().toLowerCase())) FooterElements.add(New); else OtherElements.add(New); } In.close(); System.out.println("Ok!"); System.out.println(" [!] Elements read..."); System.out.println(" [?] Header Elements: " + HeaderElements.size()); System.out.println(" [?] Title Elements: " + TitleElements.size()); System.out.println(" [?] Content Elements: " + ContentElements.size()); System.out.println(" [?] Footer Elements: " + FooterElements.size()); System.out.println(" [?] Other Elements: " + OtherElements.size()); //Order Chuncks TitleElements = SzeregowanieParagrafow.Sort(TitleElements, OtherElements, "header", "line"); for (Element TitleElement : TitleElements) { CurrentChunks.add(DecomposeGenericTextObject(TitleElement, Source)); CurrentChunks.getLast()[0] = "page." + i + "." + CurrentChunks.getLast()[0]; } ContentElements = SzeregowanieParagrafow.Sort(ContentElements, OtherElements, "header", "line"); for (Element ContentElement : ContentElements) { CurrentChunks.add(DecomposeGenericTextObject(ContentElement, Source)); CurrentChunks.getLast()[0] = "page." + i + "." + CurrentChunks.getLast()[0]; } for (Element Caption : OtherElements) if (Caption.pobierzTyp().toLowerCase().equals("caption")) { TrailingChunks.add(DecomposeGenericTextObject(Caption, Source)); TrailingChunks.getLast()[0] = "page." + i + "." + TrailingChunks.getLast()[0]; } //Combine Current Chunk List With Global Chunk List for(String[] Chunk : CurrentChunks) { if (Chunks.size() != 0 && Chunk == CurrentChunks.getFirst()) { String LastChunk = Chunks.getLast()[1].trim(); if (LastChunk.endsWith(".") || LastChunk.endsWith("?") || LastChunk.endsWith("!")) { Chunks.add(Chunk); } else { String[] OldChunk = Chunks.getLast(); OldChunk[0] += ":" + Chunk[0]; OldChunk[1] += " " + Chunk[1]; } } else Chunks.add(Chunk); } } //Save Chunk List To File //Do Header FileWriter Out = new FileWriter(Output); Out.write("\n"); Out.write("\n"); Out.write("\n"); Out.write("\n"); for (String[] Chunk : Chunks) { Out.write("" + Chunk[1] + "\n"); } for (String[] Chunk : TrailingChunks) { Out.write("" + Chunk[1] + "\n"); } //Do Footer Out.write("\n"); Out.write(""); Out.close(); System.out.println("Decomposition finished..."); System.out.println(); } catch (Exception Ex) { Ex.printStackTrace(); return; } } public final List HeaderClasses = new ArrayList(); public final List TitleClasses = new ArrayList(); public final List ContentClasses = new ArrayList(); public final List FooterClasses = new ArrayList(); public DekompozycjaDokumentuPWR() { HeaderClasses.add("pageheader"); TitleClasses.add("title"); TitleClasses.add("author"); TitleClasses.add("abstract"); ContentClasses.add("paragraph"); ContentClasses.add("header"); ContentClasses.add("enumeration"); FooterClasses.add("pagefooter"); } private String[] DecomposeGenericTextObject(Element E, IRGBImage Source) { String[] Return = new String[2]; Return[0] = BuildID(E); Return[1] = GenericTesseractInvocation(E, Source); return Return; } private String GenericTesseractInvocation(Element E, IRGBImage Source) { System.out.print("Invoking TESSERACT... "); String Return = ""; int MinX = E.pobierzPudelko()[0] - 10; if (MinX < 0) MinX = 0; int MinY = E.pobierzPudelko()[1] - 10; if (MinY < 0) MinY = 0; int MaxX = E.pobierzPudelko()[2] + 10; if (MaxX >= Source.getXSize()) MaxX = Source.getXSize() - 1; int MaxY = E.pobierzPudelko()[3] + 10; if (MaxY >= Source.getYSize()) MaxY = Source.getYSize() - 1; IRGBImage Temp = new RGBImage((MaxX - MinX) + 1, (MaxY - MinY) + 1); for (int C = 0; C < 3; C++) for (int x = 0; x < Temp.getXSize(); x++) for (int y = 0; y < Temp.getYSize(); y++) Temp.getChannel(C).setValue(x, y, Source.getChannel(C).getValue(x + MinX, y + MinY)); new GenericSaver(Temp, new File(TempFolder + "temp.png")).apply(); try { Process OCR = Runtime.getRuntime().exec(OCRExec); OCR.waitFor(); Scanner ReadFile = new Scanner(new FileReader(TempFolder + "temp.txt")); while (ReadFile.hasNextLine()) { String CurrentLine = ReadFile.nextLine().trim(); if (CurrentLine.endsWith("-")) CurrentLine = CurrentLine.substring(0, CurrentLine.length() - 1); else CurrentLine += " "; Return += CurrentLine; } System.out.println("Ok!"); } catch (Exception Ex) { System.out.println("TESSERACT OR IO ERROR!"); Return = "TESSERACT OR IO ERROR!"; } Return.replaceAll("<", "<"); Return.replaceAll(">", ">"); Return.replaceAll("&", "&"); Return.replaceAll("\"", """); Return.replaceAll("'", "'"); return Return.trim(); } public String BuildID(Element E) { return E.pobierzTyp().toLowerCase() + "." + E.pobierzPudelko()[0] + "." + E.pobierzPudelko()[1]; } }