package dokumenty.dekompozycja;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Scanner;
import pl.wroc.pwr.file.ImageFolderScanner;
import pl.wroc.pwr.image.IRGBImage;
import pl.wroc.pwr.image.io.GenericLoader;
import pl.wroc.pwr.image.io.GenericSaver;
import pl.wroc.pwr.image.model.RGBImage;
import dokumenty.Element;
public class DekompozycjaDokumentuTEI
{
// Command line handed to Runtime.exec() for every OCR call. main() overwrites
// this default with a full tesseract invocation built from TempFolder.
public String OCRExec = "tesseract";
// Prefix prepended verbatim to "temp.png" / "temp.txt" when exchanging data
// with the OCR process; no separator is inserted when the paths are built.
public String TempFolder = "";
// Suffix appended to the image path to locate the element description file
// that Decompose() reads.
public String ElementExtension = ".result";
/**
 * Command-line entry point.
 * <p>
 * Expects exactly three arguments: the image file or folder to process, the
 * extension of the element description files, and the temporary folder used
 * to exchange files with tesseract. A single file is decomposed directly; a
 * folder is scanned with {@link ImageFolderScanner} and every image found is
 * decomposed in turn.
 *
 * @param args [file or folder path] [region extension] [temp folder]
 */
public static void main(String[] args)
{
    System.out.println("XDT build: 24.07.2012");
    if (args.length != 3)
    {
        System.out.println("Illegal execution arguments!");
        System.out.println("Should be: java -Xmx1024m -Xms512m -jar XDD.jar [file or folder path] [region extension] [temp folder]");
        // The JVM option is -Xmx; the example previously advertised the invalid "-Xxm".
        System.out.println("For example: java -Xmx1024m -Xms512m -jar XDD.jar C:\\NEKST\\Temp\\b001.f001_02.png .expected C:\\NEKST\\");
        return;
    }

    // Temp file names are appended directly to the folder string, so make sure
    // it ends with a separator; otherwise "C:\tmp" would yield "C:\tmptemp.png".
    String Temp = args[2];
    if (Temp.length() > 0 && !Temp.endsWith("/") && !Temp.endsWith("\\") && !Temp.endsWith(File.separator))
    {
        Temp += File.separator;
    }

    DekompozycjaDokumentuTEI XDD = new DekompozycjaDokumentuTEI();
    XDD.TempFolder = Temp;
    XDD.OCRExec = "tesseract " + XDD.TempFolder + "temp.png " + XDD.TempFolder + "temp -l pol";
    XDD.ElementExtension = args[1];

    File InFile = new File(args[0]);
    if (InFile.isFile())
    {
        String Input = args[0];
        // NOTE(review): single-file mode writes "<input>.decomp.tei.xml" while folder
        // mode below writes "<image>.decomp.xml"; kept as found, confirm which is intended.
        XDD.Decompose(Input, Input + ".decomp.tei.xml");
    }
    else if (InFile.isDirectory())
    {
        // The loop consumes the scan result as File objects, so the list is typed accordingly.
        List<File> Images = new ImageFolderScanner().apply(InFile);
        System.out.println();
        int N = 0;
        for (File CurrentFile : Images)
        {
            N++;
            System.out.println("File " + N + " out of " + Images.size());
            String Path = CurrentFile.getAbsolutePath();
            XDD.Decompose(Path, Path + ".decomp.xml");
        }
    }
    else
    {
        // Previously a missing or special path ended the program without any message.
        System.out.println("Input path is neither a file nor a directory: " + args[0]);
    }
}
// Type names (lower-case) that Decompose() uses to bucket parsed elements.
// Only strings are ever stored or looked up, so the lists are typed as such.

/** Element types routed to the page-header bucket. */
public final List<String> HeaderClasses = new ArrayList<String>();
/** Element types routed to the title bucket. */
public final List<String> TitleClasses = new ArrayList<String>();
/** Element types routed to the content bucket. */
public final List<String> ContentClasses = new ArrayList<String>();
/** Element types routed to the page-footer bucket. */
public final List<String> FooterClasses = new ArrayList<String>();
/**
 * Creates a decomposer and registers the default type names for the
 * header, title, content and footer buckets.
 */
public DekompozycjaDokumentuTEI()
{
    HeaderClasses.add("pageheader");

    for (String Name : new String[] {"title", "author", "abstract"})
    {
        TitleClasses.add(Name);
    }

    for (String Name : new String[] {"paragraph", "header", "enumeration", "mathregion"})
    {
        ContentClasses.add(Name);
    }

    FooterClasses.add("pagefooter");
}
// Per-document buckets, re-created at the start of every Decompose() call.
// They hold parsed Element instances and are iterated as such, so they are
// declared with the element type instead of as raw lists.
private List<Element> HeaderElements;
private List<Element> TitleElements;
private List<Element> ContentElements;
private List<Element> FooterElements;
// Everything whose type is not listed in one of the *Classes lists.
private List<Element> OtherElements;
/**
 * Decomposes one page image into an output document.
 * <p>
 * The method loads the image, reads the element description file (the image
 * path with ".jpg" replaced by ".png", followed by {@code ElementExtension}),
 * buckets every parsed element by its type, passes each bucket through
 * {@code SzeregowanieParagrafow.Sort} and finally writes the output file:
 * a header section with the OCR text of title and author elements, the text
 * of every header/title/content/footer element, and the remaining objects
 * (line, note, chart, diagram, table, drawing, photo) with their captions.
 * <p>
 * Any exception is printed and aborts the current document only.
 *
 * @param Input  path of the page image
 * @param Output path of the file to write
 */
public void Decompose(String Input, String Output)
{
    HeaderElements = new ArrayList<Element>();
    TitleElements = new ArrayList<Element>();
    ContentElements = new ArrayList<Element>();
    FooterElements = new ArrayList<Element>();
    OtherElements = new ArrayList<Element>();

    System.out.println("Begining decomposition of: " + Input);
    System.out.println("Will save to: " + Output);
    try
    {
        System.out.print("Reading Image... ");
        IRGBImage Source = new GenericLoader(Input).apply();
        System.out.println("Ok!");

        System.out.print("Reading Elements... ");
        // Literal replacement: replaceAll() treated ".jpg" as a regex in which the
        // dot matches any character, so e.g. "xjpg" inside a path was rewritten too.
        BufferedReader In = new BufferedReader(new FileReader(Input.replace(".jpg", ".png") + ElementExtension));
        try
        {
            String Line;
            while ((Line = In.readLine()) != null)
            {
                Element New = Element.ParsujLinie(Line);
                String Type = New.pobierzTyp().toLowerCase();
                if (HeaderClasses.contains(Type)) HeaderElements.add(New);
                else if (TitleClasses.contains(Type)) TitleElements.add(New);
                else if (ContentClasses.contains(Type)) ContentElements.add(New);
                else if (FooterClasses.contains(Type)) FooterElements.add(New);
                else OtherElements.add(New);
            }
        }
        finally
        {
            // Release the file even when parsing a line fails.
            In.close();
        }
        System.out.println("Ok!");

        System.out.println(" [!] Elements read...");
        System.out.println(" [?] Header Elements: " + HeaderElements.size());
        System.out.println(" [?] Title Elements: " + TitleElements.size());
        System.out.println(" [?] Content Elements: " + ContentElements.size());
        System.out.println(" [?] Footer Elements: " + FooterElements.size());
        System.out.println(" [?] Other Elements: " + OtherElements.size());
        System.out.println();

        HeaderElements = SzeregowanieParagrafow.Sort(HeaderElements, OtherElements, "header", "line");
        TitleElements = SzeregowanieParagrafow.Sort(TitleElements, OtherElements, "header", "line");
        ContentElements = SzeregowanieParagrafow.Sort(ContentElements, OtherElements, "header", "line");
        FooterElements = SzeregowanieParagrafow.Sort(FooterElements, OtherElements, "header", "line");

        FileWriter Out = new FileWriter(Output);
        try
        {
            // NOTE(review): the structural literals below contain only whitespace; the
            // element markup they presumably carried is not present in this file and
            // has to be restored from the intended output format.
            Out.write("");
            Out.write("\t\n");
            Out.write("\t\t");
            Out.write("\t\t\t");
            // "title" and "author" are registered in TitleClasses, so they are bucketed
            // into TitleElements; scanning HeaderElements (which only receives
            // "pageheader") could never match and left this section empty.
            for (Element E : TitleElements)
            {
                if (E.pobierzTyp().equalsIgnoreCase("title"))
                {
                    Out.write("\t\t\t\t" + GenericTesseractInvocation(E, Source) + "");
                }
            }
            for (Element E : TitleElements)
            {
                if (E.pobierzTyp().equalsIgnoreCase("author"))
                {
                    Out.write("\t\t\t\t" + GenericTesseractInvocation(E, Source) + "");
                }
            }
            Out.write("\t\t\t");
            Out.write("\t\t");
            Out.write("\t\n");
            Out.write("\t\n");

            // Text-bearing elements, in page order: header, title, content, footer.
            for (Element E : HeaderElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n");
            for (Element E : TitleElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n");
            for (Element E : ContentElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n");
            for (Element E : FooterElements) Out.write(DecomposeGenericTextObject(E, Source) + "\n");

            // Remaining objects; captions are matched to their nearest object first.
            HashMap<Element, Element> Captions = GetCaptions(OtherElements);
            for (Element E : OtherElements)
            {
                String Type = E.pobierzTyp().toLowerCase();
                String Current;
                if (Type.equals("line")) Current = DecomposeObjectLine(E);
                else if (Type.equals("note")) Current = DecomposeGenericTextObject(E, Source);
                else if (Type.equals("chart")) Current = DecomposeObjectChart(E, Source, Captions.get(E));
                else if (Type.equals("diagram")) Current = DecomposeObjectDiagram(E, Source, Captions.get(E));
                else if (Type.equals("table")) Current = DecomposeObjectTable(E, Source, Captions.get(E));
                else if (Type.equals("drawing")) Current = DecomposeObjectDrawing(E, Source, Captions.get(E));
                else if (Type.equals("photo")) Current = DecomposeObjectPhoto(E, Source, Captions.get(E));
                else continue; // captions and unknown types produce no output of their own
                Out.write(Current + "\n");
            }

            Out.write("\t\n");
            Out.write("");
        }
        finally
        {
            // Flush and release the output file even when OCR or writing fails midway.
            Out.close();
        }
    }
    catch (Exception Ex)
    {
        Ex.printStackTrace();
        return;
    }
    System.out.println("Decomposition finished...");
    System.out.println();
}
/**
 * Pairs caption elements with the objects they describe.
 * <p>
 * Elements of type "caption" are matched, in list order, to the remaining
 * "chart", "diagram", "table", "photo" or "drawing" element with the
 * smallest {@code GetDistance} value. An object that has received a caption
 * is no longer available to later captions.
 *
 * @param Elements elements to scan; other types are ignored
 * @return insertion-ordered map from object to its caption; objects without
 *         a caption have no entry
 */
private HashMap<Element, Element> GetCaptions(List<Element> Elements)
{
    HashMap<Element, Element> Return = new LinkedHashMap<Element, Element>();
    List<Element> Captions = new ArrayList<Element>();
    List<Element> Captionable = new ArrayList<Element>();

    for (Element Current : Elements)
    {
        String Type = Current.pobierzTyp().toLowerCase();
        if (Type.equals("caption"))
        {
            Captions.add(Current);
        }
        else if (Type.equals("chart") || Type.equals("diagram") || Type.equals("table")
                || Type.equals("photo") || Type.equals("drawing"))
        {
            Captionable.add(Current);
        }
    }

    for (Element CurrentCaption : Captions)
    {
        Element Associated = null;
        double Distance = Double.POSITIVE_INFINITY;
        for (Element CurrentCaptionable : Captionable)
        {
            // Tables use a different weighting inside GetDistance.
            boolean IsTable = CurrentCaptionable.pobierzTyp().toLowerCase().equals("table");
            double CurrentDistance = GetDistance(CurrentCaption, CurrentCaptionable, IsTable);
            if (Distance > CurrentDistance)
            {
                Associated = CurrentCaptionable;
                Distance = CurrentDistance;
            }
        }
        if (Associated != null)
        {
            // Each object takes at most one caption.
            Captionable.remove(Associated);
            Return.put(Associated, CurrentCaption);
        }
    }
    return Return;
}
/**
 * Scores how far a caption is from a candidate object; lower is closer.
 * <p>
 * Boxes are read as [minX, minY, maxX, maxY]. When the caption centre lies
 * within the object's X range, the two vertical edge gaps are scored with
 * the first and second weight; when it lies within the Y range, the two
 * horizontal edge gaps are scored with the third weight. The smallest score
 * wins. If neither range contains the centre, the result is ten times the
 * distance between the two box centres.
 *
 * @param Caption caption element
 * @param Object  candidate object element
 * @param IsTable selects the table weights (2, 1, 4) instead of (1, 2, 4)
 * @return weighted distance
 */
private double GetDistance(Element Caption, Element Object, Boolean IsTable)
{
    final double WeightFirst;
    final double WeightSecond;
    final double WeightSide = 4.0;
    if (IsTable)
    {
        WeightFirst = 2.0;
        WeightSecond = 1.0;
    }
    else
    {
        WeightFirst = 1.0;
        WeightSecond = 2.0;
    }

    int CapMinX = Caption.pobierzPudelko()[0];
    int CapMinY = Caption.pobierzPudelko()[1];
    int CapMaxX = Caption.pobierzPudelko()[2];
    int CapMaxY = Caption.pobierzPudelko()[3];
    int ObjMinX = Object.pobierzPudelko()[0];
    int ObjMinY = Object.pobierzPudelko()[1];
    int ObjMaxX = Object.pobierzPudelko()[2];
    int ObjMaxY = Object.pobierzPudelko()[3];

    double CapCenterX = (CapMinX + CapMaxX) / 2.0;
    double CapCenterY = (CapMinY + CapMaxY) / 2.0;
    double ObjCenterX = (ObjMinX + ObjMaxX) / 2.0;
    double ObjCenterY = (ObjMinY + ObjMaxY) / 2.0;

    boolean Aligned = false;
    double Best = Double.POSITIVE_INFINITY;

    // Caption centre sits inside the object's horizontal extent: compare vertical gaps.
    if (CapCenterX > ObjMinX && CapCenterX < ObjMaxX)
    {
        Aligned = true;
        Best = Math.min(Best, Math.abs(ObjMaxY - CapMinY) * WeightFirst);
        Best = Math.min(Best, Math.abs(ObjMinY - CapMaxY) * WeightSecond);
    }

    // Caption centre sits inside the object's vertical extent: compare horizontal gaps.
    if (CapCenterY > ObjMinY && CapCenterY < ObjMaxY)
    {
        Aligned = true;
        Best = Math.min(Best, Math.abs(ObjMinX - CapMaxX) * WeightSide);
        Best = Math.min(Best, Math.abs(ObjMaxX - CapMinX) * WeightSide);
    }

    if (Aligned)
    {
        return Best;
    }

    // No alignment on either axis: fall back to the penalised centre-to-centre distance.
    double DeltaX = CapCenterX - ObjCenterX;
    double DeltaY = CapCenterY - ObjCenterY;
    return Math.sqrt((DeltaX * DeltaX) + (DeltaY * DeltaY)) * 10;
}
private String DecomposeGenericTextObject(Element E, IRGBImage Source)
{
String Type = E.pobierzTyp();
Type = Type.toLowerCase();
if (Type.compareTo("title") == 0)
{
String Return = "\t\t";
Return += GenericTesseractInvocation(E, Source);
Return += "
";
return Return;
}
else if (Type.compareTo("author") == 0)
{
String Return = "\t\t";
Return += GenericTesseractInvocation(E, Source);
Return += "
";
return Return;
}
else if (Type.compareTo("pageheader") == 0)
{
String Return = "\t\t";
Return += GenericTesseractInvocation(E, Source);
Return += "
";
return Return;
}
else if (Type.compareTo("mathregion") == 0)
{
String Return = "\t\t";
Return += "";
return Return;
}
else if (Type.compareTo("pagefooter") == 0)
{
String Return = "\t\t";
Return += GenericTesseractInvocation(E, Source);
Return += "
";
return Return;
}
else
{
if (Type.compareTo("header") == 0) Type = "head";
else Type = "p";
String Return = "\t\t<" + Type + ">";
Return += GenericTesseractInvocation(E, Source);
Return += "" + Type + ">";
return Return;
}
}
/**
 * Runs OCR on the region of the page covered by an element.
 * <p>
 * The element box is grown by 10 pixels on every side (clamped to the image),
 * copied into a new image and saved as {@code TempFolder + "temp.png"}. The
 * command in {@code OCRExec} is then executed and the text is read back from
 * {@code TempFolder + "temp.txt"}.
 *
 * @param E      element whose box is recognised
 * @param Source page image
 * @return every recognised line prefixed with a line break, or the text
 *         "TESSERACT OR IO ERROR!" when the process or the read fails
 */
private String GenericTesseractInvocation(Element E, IRGBImage Source)
{
    System.out.print("Invoking TESSERACT... ");

    int MinX = Math.max(E.pobierzPudelko()[0] - 10, 0);
    int MinY = Math.max(E.pobierzPudelko()[1] - 10, 0);
    int MaxX = Math.min(E.pobierzPudelko()[2] + 10, Source.getXSize() - 1);
    int MaxY = Math.min(E.pobierzPudelko()[3] + 10, Source.getYSize() - 1);

    // Copy the padded region, channel by channel, into a standalone image.
    IRGBImage Temp = new RGBImage((MaxX - MinX) + 1, (MaxY - MinY) + 1);
    for (int C = 0; C < 3; C++)
    {
        for (int x = 0; x < Temp.getXSize(); x++)
        {
            for (int y = 0; y < Temp.getYSize(); y++)
            {
                Temp.getChannel(C).setValue(x, y, Source.getChannel(C).getValue(x + MinX, y + MinY));
            }
        }
    }
    new GenericSaver(Temp, new File(TempFolder + "temp.png")).apply();

    // Remove the result of a previous call so that a failed OCR run surfaces as an
    // error instead of silently returning the text of an earlier element.
    File Result = new File(TempFolder + "temp.txt");
    Result.delete();

    // NOTE(review): the returned text is inserted into the output without escaping
    // characters such as '<' or '&'; confirm whether callers need it escaped.
    StringBuilder Return = new StringBuilder();
    try
    {
        Process OCR = Runtime.getRuntime().exec(OCRExec);
        OCR.waitFor();
        Scanner ReadFile = new Scanner(new FileReader(Result));
        try
        {
            while (ReadFile.hasNextLine())
            {
                Return.append("\n").append(ReadFile.nextLine());
            }
        }
        finally
        {
            // Release temp.txt so the next invocation can overwrite it.
            ReadFile.close();
        }
        System.out.println("Ok!");
    }
    catch (InterruptedException Ex)
    {
        // Keep the interrupt visible to the caller's thread.
        Thread.currentThread().interrupt();
        System.out.println("TESSERACT OR IO ERROR!");
        return "TESSERACT OR IO ERROR!";
    }
    catch (Exception Ex)
    {
        System.out.println("TESSERACT OR IO ERROR!");
        return "TESSERACT OR IO ERROR!";
    }
    return Return.toString();
}
/**
 * Renders a "line" element: an indented block containing the text
 * "Line: " followed by the element's size string from BuildSize.
 *
 * @param E line element
 * @return the rendered fragment
 */
private String DecomposeObjectLine(Element E)
{
    StringBuilder Fragment = new StringBuilder();
    Fragment.append("\t\t");
    Fragment.append("\n\t\t\tLine: ").append(BuildSize(E)).append("");
    Fragment.append("\n\t\t");
    return Fragment.toString();
}
/*private String DecomposeObjectNote(Element E, IImage Source)
{
String Return = "\t\t";
//Insert content here!
Return += "";
return Return;
}*/
private String DecomposeObjectChart(Element E, IRGBImage Source, Element C)
{
String Return = "\t\t";
//Insert content here!
if (C != null)
{
Return += "\n\t\t\t";
Return += GenericTesseractInvocation(C, Source);
Return += "\n\t\t\t
";
}
Return += "\n\t\t";
return Return;
}
private String DecomposeObjectDiagram(Element E, IRGBImage Source, Element C)
{
String Return = "\t\t";
//Insert content here!
if (C != null)
{
Return += "\n\t\t\t";
Return += GenericTesseractInvocation(C, Source);
Return += "\n\t\t\t
";
}
Return += "\n\t\t";
return Return;
}
private String DecomposeObjectTable(Element E, IRGBImage Source, Element C)
{
String Return = "\t\t";
if (C != null)
{
Return += "\n\t\t\t";
Return += GenericTesseractInvocation(C, Source);
Return += "\n\t\t\t";
}
//Insert content here!
Return += "\n\t\t
";
return Return;
}
private String DecomposeObjectDrawing(Element E, IRGBImage Source, Element C)
{
String Return = "\t\t";
//Insert content here!
if (C != null)
{
Return += "\n\t\t\t";
Return += GenericTesseractInvocation(C, Source);
Return += "\n\t\t\t
";
}
Return += "\n\t\t";
return Return;
}
private String DecomposeObjectPhoto(Element E, IRGBImage Source, Element C)
{
String Return = "\t\t";
//Insert content here!
if (C != null)
{
Return += "\n\t\t\t";
Return += GenericTesseractInvocation(C, Source);
Return += "\n\t\t\t
";
}
Return += "\n\t\t";
return Return;
}
/**
 * Formats an element's box as four values joined by the two-character
 * separator "/n": box[0], box[1], box[2] - box[0] and box[3] - box[1].
 *
 * @param E element whose box is formatted
 * @return the formatted string
 */
public String BuildSize(Element E)
{
    // NOTE(review): the separator is a slash followed by 'n', not a line break;
    // confirm whether "\n" was intended.
    StringBuilder Size = new StringBuilder();
    Size.append(E.pobierzPudelko()[0]).append("/n");
    Size.append(E.pobierzPudelko()[1]).append("/n");
    Size.append(E.pobierzPudelko()[2] - E.pobierzPudelko()[0]).append("/n");
    Size.append(E.pobierzPudelko()[3] - E.pobierzPudelko()[1]);
    return Size.toString();
}
}