package dokumenty.dekompozycja; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Scanner; import pl.wroc.pwr.file.ImageFolderScanner; import pl.wroc.pwr.image.IRGBImage; import pl.wroc.pwr.image.io.GenericLoader; import pl.wroc.pwr.image.io.GenericSaver; import pl.wroc.pwr.image.model.RGBImage; import dokumenty.Element; public class DekompozycjaDokumentuTEI { public String OCRExec = "tesseract"; public String TempFolder = ""; public String ElementExtension = ".result"; public static void main(String[] args) { System.out.println("XDT build: 24.07.2012"); if (args.length != 3) { System.out.println("Illegal execution arguments!"); System.out.println("Should be: java -Xmx1024m -Xms512m -jar XDD.jar [file or folder path] [region extension] [temp folder]"); System.out.println("For example: java -Xxm1024m -Xms512m -jar XDD.jar C:\\NEKST\\Temp\\b001.f001_02.png .expected C:\\NEKST\\"); } else { DekompozycjaDokumentuTEI XDD = new DekompozycjaDokumentuTEI(); XDD.TempFolder = args[2]; XDD.OCRExec = "tesseract " + XDD.TempFolder + "temp.png " + XDD.TempFolder + "temp -l pol"; XDD.ElementExtension = args[1]; File InFile = new File(args[0]); if (InFile.isFile()) { String Input = args[0]; XDD.Decompose(Input, Input + ".decomp.tei.xml"); } else if (InFile.isDirectory()) { int N = 0; List Images = new ImageFolderScanner().apply(InFile); System.out.println(); for (File CurrentFile : Images) { N++; System.out.println("File " + N + " out of " + Images.size()); XDD.Decompose(CurrentFile.getAbsolutePath(), CurrentFile.getAbsolutePath() + ".decomp.xml"); } } } } public final List HeaderClasses = new ArrayList(); public final List TitleClasses = new ArrayList(); public final List ContentClasses = new ArrayList(); public final List FooterClasses = new ArrayList(); public DekompozycjaDokumentuTEI() { HeaderClasses.add("pageheader"); TitleClasses.add("title"); TitleClasses.add("author"); TitleClasses.add("abstract"); ContentClasses.add("paragraph"); ContentClasses.add("header"); ContentClasses.add("enumeration"); ContentClasses.add("mathregion"); FooterClasses.add("pagefooter"); } private List HeaderElements; private List TitleElements; private List ContentElements; private List FooterElements; private List OtherElements; public void Decompose(String Input, String Output) { HeaderElements = new ArrayList(); TitleElements = new ArrayList(); ContentElements = new ArrayList(); FooterElements = new ArrayList(); OtherElements = new ArrayList(); IRGBImage Source = null; System.out.println("Begining decomposition of: " + Input); System.out.println("Will save to: " + Output); try { System.out.print("Reading Image... "); Source = new GenericLoader(Input).apply(); System.out.println("Ok!"); System.out.print("Reading Elements... "); BufferedReader In = new BufferedReader(new FileReader(Input.replaceAll(".jpg", ".png") + ElementExtension)); String Line; while ((Line = In.readLine()) != null) { Element New = Element.ParsujLinie(Line); if (HeaderClasses.contains(New.pobierzTyp().toLowerCase())) HeaderElements.add(New); else if (TitleClasses.contains(New.pobierzTyp().toLowerCase())) TitleElements.add(New); else if (ContentClasses.contains(New.pobierzTyp().toLowerCase())) ContentElements.add(New); else if (FooterClasses.contains(New.pobierzTyp().toLowerCase())) FooterElements.add(New); else OtherElements.add(New); } In.close(); System.out.println("Ok!"); System.out.println(" [!] Elements read..."); System.out.println(" [?] Header Elements: " + HeaderElements.size()); System.out.println(" [?] Title Elements: " + TitleElements.size()); System.out.println(" [?] Content Elements: " + ContentElements.size()); System.out.println(" [?] Footer Elements: " + FooterElements.size()); System.out.println(" [?] Other Elements: " + OtherElements.size()); System.out.println(); HeaderElements = SzeregowanieParagrafow.Sort(HeaderElements, OtherElements, "header", "line"); TitleElements = SzeregowanieParagrafow.Sort(TitleElements, OtherElements, "header", "line"); ContentElements = SzeregowanieParagrafow.Sort(ContentElements, OtherElements, "header", "line"); FooterElements = SzeregowanieParagrafow.Sort(FooterElements, OtherElements, "header", "line"); FileWriter Out = new FileWriter(Output); Out.write(""); { Out.write("\t\n"); { Out.write("\t\t"); { Out.write("\t\t\t"); for (Element E : HeaderElements) if(E.pobierzTyp().equalsIgnoreCase("title")) { Out.write("\t\t\t\t" + GenericTesseractInvocation(E, Source) + ""); } for (Element E : HeaderElements) if(E.pobierzTyp().equalsIgnoreCase("author")) { Out.write("\t\t\t\t" + GenericTesseractInvocation(E, Source) + ""); } Out.write("\t\t\t"); } Out.write("\t\t"); //Out.write("\t\t" + Input + "\n"); //Out.write("\t\t" + Source.getXSize() + "\n"); //Out.write("\t\t" + Source.getYSize() + "\n"); //Out.write("\t\t" + Input + ElementExtension + "\n"); } Out.write("\t\n"); Out.write("\t\n"); { for (Element E : HeaderElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n"); for (Element E : TitleElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n"); for (Element E : ContentElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n"); for (Element E : FooterElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n"); } { HashMap Captions = GetCaptions(OtherElements); for (Element E : OtherElements) { String Current = ""; if (E.pobierzTyp().toLowerCase().equals("line")) Current = DecomposeObjectLine(E); else if (E.pobierzTyp().toLowerCase().equals("note")) Current = DecomposeGenericTextObject(E, Source); else if (E.pobierzTyp().toLowerCase().equals("chart")) Current = DecomposeObjectChart(E, Source, Captions.get(E)); else if (E.pobierzTyp().toLowerCase().equals("diagram")) Current = DecomposeObjectDiagram(E, Source, Captions.get(E)); else if (E.pobierzTyp().toLowerCase().equals("table")) Current = DecomposeObjectTable(E, Source, Captions.get(E)); else if (E.pobierzTyp().toLowerCase().equals("drawing")) Current = DecomposeObjectDrawing(E, Source, Captions.get(E)); else if (E.pobierzTyp().toLowerCase().equals("photo")) Current = DecomposeObjectPhoto(E, Source, Captions.get(E)); else continue; Out.write(Current + "\n"); } } Out.write("\t\n"); } Out.write(""); Out.close(); } catch (Exception Ex) { Ex.printStackTrace(); return; } System.out.println("Decomposition finished..."); System.out.println(); } private HashMap GetCaptions(List Elements) { HashMap Return = new LinkedHashMap(); List Captions = new ArrayList(); List Captionable = new ArrayList(); for (Element Current : Elements) { if (Current.pobierzTyp().toLowerCase().equals("caption")) Captions.add(Current); else if (Current.pobierzTyp().toLowerCase().equals("chart") || Current.pobierzTyp().toLowerCase().equals("diagram") || Current.pobierzTyp().toLowerCase().equals("table") || Current.pobierzTyp().toLowerCase().equals("photo") || Current.pobierzTyp().toLowerCase().equals("drawing")) Captionable.add(Current); else continue; } for (Element CurrentCaption : Captions) { Element Associated = null; double Distance = Double.POSITIVE_INFINITY; for (Element CurrentCaptionable : Captionable) { double CurrentDistance; if (CurrentCaptionable.pobierzTyp().toLowerCase().equals("table")) { CurrentDistance = GetDistance(CurrentCaption, CurrentCaptionable, true); } else { CurrentDistance = GetDistance(CurrentCaption, CurrentCaptionable, false); } if (Distance > CurrentDistance) { Associated = CurrentCaptionable; Distance = CurrentDistance; } } if (Associated != null) { Captionable.remove(Associated); Return.put(Associated, CurrentCaption); } } return Return; } private double GetDistance(Element Caption, Element Object, Boolean IsTable) { double[] TableW = new double[]{2.0, 1.0, 4.0}; double[] NormalW = new double[]{1.0, 2.0, 4.0}; double[] CurrentW = IsTable ? TableW : NormalW; boolean Found = false; double Return = Double.POSITIVE_INFINITY; double X1 = (Caption.pobierzPudelko()[0] + Caption.pobierzPudelko()[2]) / 2.0; double Y1 = (Caption.pobierzPudelko()[1] + Caption.pobierzPudelko()[3]) / 2.0; double X2 = (Object.pobierzPudelko()[0] + Object.pobierzPudelko()[2]) / 2.0; double Y2 = (Object.pobierzPudelko()[1] + Object.pobierzPudelko()[3]) / 2.0; // Attempt to find nearest box with certain rules... // Start with checking if camption is below OR above image... if (X1 > Object.pobierzPudelko()[0] && X1 < Object.pobierzPudelko()[2]) { Found = true; double TempReturn; TempReturn = Math.abs(Object.pobierzPudelko()[3] - Caption.pobierzPudelko()[1]) * CurrentW[0]; if (TempReturn < Return) Return = TempReturn; TempReturn = Math.abs(Object.pobierzPudelko()[1] - Caption.pobierzPudelko()[3]) * CurrentW[1]; if (TempReturn < Return) Return = TempReturn; } // Continue with checking if caption is to the side of an image... if (Y1 > Object.pobierzPudelko()[1] && Y1 < Object.pobierzPudelko()[3]) { Found = true; double TempReturn; TempReturn = Math.abs(Object.pobierzPudelko()[0] - Caption.pobierzPudelko()[2]) * CurrentW[2]; if (TempReturn < Return) Return = TempReturn; TempReturn = Math.abs(Object.pobierzPudelko()[2] - Caption.pobierzPudelko()[0]) * CurrentW[2]; if (TempReturn < Return) Return = TempReturn; } // Just find nearest box... if (!Found) { double X = X1 - X2; double Y = Y1 - Y2; Return = Math.sqrt((X*X) + (Y*Y)) * 10; } return Return; } private String DecomposeGenericTextObject(Element E, IRGBImage Source) { String Type = E.pobierzTyp(); Type = Type.toLowerCase(); if (Type.compareTo("title") == 0) { String Return = "\t\t

"; Return += GenericTesseractInvocation(E, Source); Return += "

"; return Return; } else if (Type.compareTo("author") == 0) { String Return = "\t\t

"; Return += GenericTesseractInvocation(E, Source); Return += "

"; return Return; } else if (Type.compareTo("pageheader") == 0) { String Return = "\t\t

"; Return += GenericTesseractInvocation(E, Source); Return += "

"; return Return; } else if (Type.compareTo("mathregion") == 0) { String Return = "\t\t"; Return += ""; return Return; } else if (Type.compareTo("pagefooter") == 0) { String Return = "\t\t

"; Return += GenericTesseractInvocation(E, Source); Return += "

"; return Return; } else { if (Type.compareTo("header") == 0) Type = "head"; else Type = "p"; String Return = "\t\t<" + Type + ">"; Return += GenericTesseractInvocation(E, Source); Return += ""; return Return; } } private String GenericTesseractInvocation(Element E, IRGBImage Source) { System.out.print("Invoking TESSERACT... "); String Return = ""; int MinX = E.pobierzPudelko()[0] - 10; if (MinX < 0) MinX = 0; int MinY = E.pobierzPudelko()[1] - 10; if (MinY < 0) MinY = 0; int MaxX = E.pobierzPudelko()[2] + 10; if (MaxX >= Source.getXSize()) MaxX = Source.getXSize() - 1; int MaxY = E.pobierzPudelko()[3] + 10; if (MaxY >= Source.getYSize()) MaxY = Source.getYSize() - 1; IRGBImage Temp = new RGBImage((MaxX - MinX) + 1, (MaxY - MinY) + 1); for (int C = 0; C < 3; C++) for (int x = 0; x < Temp.getXSize(); x++) for (int y = 0; y < Temp.getYSize(); y++) Temp.getChannel(C).setValue(x, y, Source.getChannel(C).getValue(x + MinX, y + MinY)); new GenericSaver(Temp, new File(TempFolder + "temp.png")).apply(); try { Process OCR = Runtime.getRuntime().exec(OCRExec); OCR.waitFor(); Scanner ReadFile = new Scanner(new FileReader(TempFolder + "temp.txt")); while (ReadFile.hasNextLine()) Return += "\n" + ReadFile.nextLine(); System.out.println("Ok!"); } catch (Exception Ex) { System.out.println("TESSERACT OR IO ERROR!"); Return = "TESSERACT OR IO ERROR!"; } return Return; } private String DecomposeObjectLine(Element E) { String Return = "\t\t
"; Return += "\n\t\t\tLine: " + BuildSize(E) + ""; Return += "\n\t\t
"; return Return; } /*private String DecomposeObjectNote(Element E, IImage Source) { String Return = "\t\t"; //Insert content here! Return += ""; return Return; }*/ private String DecomposeObjectChart(Element E, IRGBImage Source, Element C) { String Return = "\t\t
"; //Insert content here! if (C != null) { Return += "\n\t\t\t

"; Return += GenericTesseractInvocation(C, Source); Return += "\n\t\t\t

"; } Return += "\n\t\t
"; return Return; } private String DecomposeObjectDiagram(Element E, IRGBImage Source, Element C) { String Return = "\t\t
"; //Insert content here! if (C != null) { Return += "\n\t\t\t

"; Return += GenericTesseractInvocation(C, Source); Return += "\n\t\t\t

"; } Return += "\n\t\t
"; return Return; } private String DecomposeObjectTable(Element E, IRGBImage Source, Element C) { String Return = "\t\t"; if (C != null) { Return += "\n\t\t\t"; Return += GenericTesseractInvocation(C, Source); Return += "\n\t\t\t"; } //Insert content here! Return += "\n\t\t
"; return Return; } private String DecomposeObjectDrawing(Element E, IRGBImage Source, Element C) { String Return = "\t\t
"; //Insert content here! if (C != null) { Return += "\n\t\t\t

"; Return += GenericTesseractInvocation(C, Source); Return += "\n\t\t\t

"; } Return += "\n\t\t
"; return Return; } private String DecomposeObjectPhoto(Element E, IRGBImage Source, Element C) { String Return = "\t\t
"; //Insert content here! if (C != null) { Return += "\n\t\t\t

"; Return += GenericTesseractInvocation(C, Source); Return += "\n\t\t\t

"; } Return += "\n\t\t
"; return Return; } public String BuildSize(Element E) { String Return = ""; Return += E.pobierzPudelko()[0]; Return += "/n"; Return += E.pobierzPudelko()[1]; Return += "/n"; Return += (E.pobierzPudelko()[2] - E.pobierzPudelko()[0]); Return += "/n"; Return += (E.pobierzPudelko()[3] - E.pobierzPudelko()[1]); return Return; } }