package dokumenty.dekompozycja; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import pl.wroc.pwr.file.ImageFolderScanner; import dokumenty.Element; import dokumenty.SerializatorElementow; public class SzeregowanieParagrafow { private static int Num = 0; public static boolean Verbose = false; public static List Sort(List TextElements, List OtherElements, String Header, String Line) { List SortedText = null; if (TextElements.size() == 0) { SortedText = new ArrayList(); } else if (TextElements.size() == 1) { SortedText = TextElements; } else { boolean FoundSolution = false; int[] Bounds = GetBounds(TextElements); //Rule 1: divide by header... /*{ Element CutElement = CutH(TextElements, OtherElements, Bounds, Header, Line); if (CutElement != null) { FoundSolution = true; List Top = new ArrayList(); List Bottom = new ArrayList(); for (Element Current : TextElements) { if (Current != CutElement) { if (Current.pobierzPudelko()[3] <= CutElement.pobierzPudelko()[3]) Top.add(Current); else Bottom.add(Current); } } if (Verbose) System.out.print("\t\t"); if (Verbose) for (int i = 0; i < Num; i++) System.out.print(" "); if (Verbose) System.out.println("HCut: " + Top.size() + "/1/" + Bottom.size() + " with: " + CutElement.pobierzPudelko()[3] + " bound: " + Bounds[3]); Num++; SortedText = Join(Sort(Top, OtherElements, Header, Line), CutElement, Sort(Bottom, OtherElements, Header, Line)); Num--; } }*/ //Rule 2: divide top to bottom... if (!FoundSolution) { Element CutElement = CutX(TextElements, OtherElements, Bounds, Line); if (CutElement != null) { FoundSolution = true; List Left = new ArrayList(); List Right = new ArrayList(); for (Element Current : TextElements) { if (Current.pobierzPudelko()[2] <= CutElement.pobierzPudelko()[2]) Left.add(Current); else Right.add(Current); } if (Verbose) System.out.print("\t\t"); if (Verbose) for (int i = 0; i < Num; i++) System.out.print(" "); if (Verbose) System.out.println("XCut: " + Left.size() + "/" + Right.size() + " with: " + CutElement.pobierzPudelko()[2] + " bound: " + Bounds[2]); Num++; SortedText = Join(Sort(Left, OtherElements, Header, Line), null, Sort(Right, OtherElements, Header, Line)); Num--; } } //Rule 3: divide left to right... if (!FoundSolution) { Element CutElement = CutY(TextElements, OtherElements, Bounds); if (CutElement != null) { FoundSolution = true; List Top = new ArrayList(); List Bottom = new ArrayList(); for (Element Current : TextElements) { if (Current.pobierzPudelko()[3] <= CutElement.pobierzPudelko()[3]) Top.add(Current); else Bottom.add(Current); } if (Verbose) System.out.print("\t\t"); if (Verbose) for (int i = 0; i < Num; i++) System.out.print(" "); if (Verbose) System.out.println("YCut: " + Top.size() + "/" + Bottom.size() + " with: " + CutElement.pobierzPudelko()[3] + " bound: " + Bounds[3]); Num++; SortedText = Join(Sort(Top, OtherElements, Header, Line), null, Sort(Bottom, OtherElements, Header, Line)); Num--; } } //Rule 4: return all acording to top height... if (!FoundSolution) { if (Verbose) System.out.print("\t\t"); if (Verbose) for (int i = 0; i < Num; i++) System.out.print(" "); if (Verbose) System.out.println("HSort: " + TextElements.size()); SortedText = SortByHeight(TextElements); } } return SortedText; } private static List Join(List First, Element Middle, List Second) { List Joined = new ArrayList(); for (Element Current : First) Joined.add(Current); if (Middle != null) Joined.add(Middle); for (Element Current : Second) Joined.add(Current); return Joined; } private static List SortByHeight(List SortThis) { List Sorted = new ArrayList(); Element Current; int CurrentY; while (SortThis.size() > 0) { Current = null; CurrentY = Integer.MAX_VALUE; for (Element E : SortThis) { if (E.pobierzPudelko()[1] < CurrentY) { CurrentY = E.pobierzPudelko()[1]; Current = E; } } SortThis.remove(Current); Sorted.add(Current); } return Sorted; } private static int[] GetBounds(List TextElements) { int[] Return = new int[4]; Return[0] = Integer.MAX_VALUE; Return[1] = Integer.MAX_VALUE; Return[2] = Integer.MIN_VALUE; Return[3] = Integer.MIN_VALUE; for(Element Current : TextElements) { if (Current.pobierzPudelko()[0] < Return[0]) Return[0] = Current.pobierzPudelko()[0]; if (Current.pobierzPudelko()[1] < Return[1]) Return[1] = Current.pobierzPudelko()[1]; if (Current.pobierzPudelko()[2] > Return[2]) Return[2] = Current.pobierzPudelko()[2]; if (Current.pobierzPudelko()[3] > Return[3]) Return[3] = Current.pobierzPudelko()[3]; } return Return; } private static Element CutH(List TextElements, List OtherElements, int[] Bounds, String Header, String Line) { Element Return = null; for(Element Current: TextElements) { if (Current.pobierzTyp().compareToIgnoreCase(Header) == 0) { boolean Found = true; for(Element Try : TextElements) { if (Current.pobierzPudelko()[1] >= Try.pobierzPudelko()[3] && Current.pobierzPudelko()[3] <= Try.pobierzPudelko()[1]) { Found = false; break; } } if (Found) for (Element Try : OtherElements) { if (Bounds[0] > Try.pobierzPudelko()[2] && Bounds[2] < Try.pobierzPudelko()[0] && Bounds[1] > Try.pobierzPudelko()[3] && Bounds[3] < Try.pobierzPudelko()[1] && Current.pobierzPudelko()[1] >= Try.pobierzPudelko()[3] && Current.pobierzPudelko()[3] <= Try.pobierzPudelko()[1] && !Current.pobierzTyp().toLowerCase().equals(Line)) { Found = false; break; } } if (Found) { Return = Current; break; } } } return Return; } private static Element CutX(List TextElements, List OtherElements, int[] Bounds, String Line) { Element Return = null; for(Element Current: TextElements) { int Temp = Current.pobierzPudelko()[2]; if (Temp < Bounds[2]) { boolean Found = true; for(Element Try: TextElements) { if (Temp < Try.pobierzPudelko()[2] && Temp >= Try.pobierzPudelko()[0]) { Found = false; break; } } if (Found) for (Element Try : OtherElements) { if (Current.pobierzTyp().toLowerCase().equals(Line) && Bounds[0] > Try.pobierzPudelko()[2] && Bounds[2] < Try.pobierzPudelko()[0] && Bounds[1] > Try.pobierzPudelko()[3] && Bounds[3] < Try.pobierzPudelko()[1] && Current.pobierzPudelko()[1] >= Try.pobierzPudelko()[3] && Current.pobierzPudelko()[3] <= Try.pobierzPudelko()[1]) { Found = false; break; } } if (Found) { Return = Current; break; } } } return Return; } private static Element CutY(List TextElements, List OtherElements, int[] Bounds) { Element Return = null; for(Element Current: TextElements) { int Temp = Current.pobierzPudelko()[3]; if (Temp < Bounds[3]) { boolean Found = true; for(Element Try : TextElements) { if (Temp < Try.pobierzPudelko()[3] && Temp >= Try.pobierzPudelko()[1]) { Found = false; break; } } if (Found) for (Element Try : OtherElements) { if (Bounds[0] > Try.pobierzPudelko()[2] && Bounds[2] < Try.pobierzPudelko()[0] && Bounds[1] > Try.pobierzPudelko()[3] && Bounds[3] < Try.pobierzPudelko()[1] && Temp < Try.pobierzPudelko()[3] && Temp >= Try.pobierzPudelko()[1]) { Found = false; break; } } if (Found) { Return = Current; break; } } } return Return; } public static void main(String[] args) { Verbose = true; List HeaderClasses = new ArrayList(); List TitleClasses = new ArrayList(); List ContentClasses = new ArrayList(); List FooterClasses = new ArrayList(); HeaderClasses.add("pageheader"); TitleClasses.add("title"); TitleClasses.add("author"); TitleClasses.add("abstract"); ContentClasses.add("paragraph"); ContentClasses.add("header"); ContentClasses.add("enumeration"); ContentClasses.add("mathregion"); FooterClasses.add("pagefooter"); String Source = "C:\\NEKST\\Temp"; List Images = new ImageFolderScanner().apply(new File(Source)); SerializatorElementow ExpectedGetter = new SerializatorElementow(".expected"); SerializatorElementow OrderGetter = new SerializatorElementow(".order"); System.out.println("Files found: " + Images.size()); System.out.println(); String CorrectOrder = "abcdefghijklmnoprstuwyz"; int FilesProcessed = 0; int FilesProcessedCorrectly = 0; int TotalEditDistance = 0; int[] DistanceMatrix = new int[10]; for (File CurrentFile : Images) { System.out.println("\tProcessing file: " + CurrentFile); List ExpectedElements = ExpectedGetter.wczytaj(CurrentFile); List OrderElements = OrderGetter.wczytaj(CurrentFile); List HeaderElements = new LinkedList(); List TitleElements = new LinkedList(); List ContentElements = new LinkedList(); List FooterElements = new LinkedList(); List OtherElements = new LinkedList(); List OrderedContentElements = new LinkedList(); for (Element CurrentElement : ExpectedElements) { if (HeaderClasses.contains(CurrentElement.pobierzTyp().toLowerCase())) HeaderElements.add(CurrentElement); else if (TitleClasses.contains(CurrentElement.pobierzTyp().toLowerCase())) TitleElements.add(CurrentElement); else if (ContentClasses.contains(CurrentElement.pobierzTyp().toLowerCase())) ContentElements.add(CurrentElement); else if (FooterClasses.contains(CurrentElement.pobierzTyp().toLowerCase())) FooterElements.add(CurrentElement); else OtherElements.add(CurrentElement); } for (Element CurrentElement : OrderElements) { if (CurrentElement.pobierzTyp().length() == 1) OrderedContentElements.add(CurrentElement); } if (ExpectedElements.size() == OrderElements.size() && ContentElements.size() == OrderedContentElements.size() && ContentElements.size() > 0) { FilesProcessed++; List Sorted = Sort(ContentElements, OtherElements, "header", "line"); String OrderText = ""; for (Element CurrentElement : Sorted) { double CurrentDistance = Double.POSITIVE_INFINITY; Element CorespondingElement = null; for (Element OrderedElement : OrderedContentElements) { double Distance = (OrderedElement.pobierzPudelko()[0] - CurrentElement.pobierzPudelko()[0]) * (OrderedElement.pobierzPudelko()[0] - CurrentElement.pobierzPudelko()[0]); Distance += (OrderedElement.pobierzPudelko()[1] - CurrentElement.pobierzPudelko()[1]) * (OrderedElement.pobierzPudelko()[1] - CurrentElement.pobierzPudelko()[1]); Distance = Math.sqrt(Distance); if (Distance < CurrentDistance) { CurrentDistance = Distance; CorespondingElement = OrderedElement; } } OrderText += CorespondingElement.pobierzTyp(); } String CurrentCorrectOrder = CorrectOrder.substring(0, OrderText.length()); int EditDistance = LevenshteinDistance.damlev(OrderText, CurrentCorrectOrder); TotalEditDistance += EditDistance; DistanceMatrix[EditDistance]++; System.out.println("\t\tSorted order: " + OrderText); System.out.println("\t\tCorrect order: " + CurrentCorrectOrder); System.out.println("\t\tEdit distance: " + EditDistance); if (EditDistance == 0) FilesProcessedCorrectly++; } else { System.out.println("\t\tSkipping file, element number different for .expected and .order file!"); continue; } } System.out.println(); System.out.println("Files processed: " + FilesProcessed + " out of " + Images.size()); System.out.println("Files processed correctly: " + FilesProcessedCorrectly + " out of " + FilesProcessed); System.out.println("Total edit distance: " + ((double)TotalEditDistance / (double)FilesProcessed)); for (int i = 0; i < 10; i++) if (DistanceMatrix[i] > 0) System.out.println("Edit distance = " + i + ": " + DistanceMatrix[i]); } }