package XML::TMX::Reader; use 5.004; use warnings; use strict; use Exporter (); use vars qw($VERSION @ISA @EXPORT_OK); use XML::DT; use XML::TMX::Writer; $VERSION = '0.25'; @ISA = 'Exporter'; @EXPORT_OK = qw(); =encoding utf-8 =head1 NAME XML::TMX::Reader - Perl extension for reading TMX files =head1 SYNOPSIS use XML::TMX::Reader; my $reader = XML::TMX::Reader->new( $filename ); $reader -> for_tu( sub { my $tu = shift; #blah blah blah }); @used_languages = $reader->languages; $reader->to_html() =head1 DESCRIPTION This module provides a simple way for reading TMX files. =head1 METHODS The following methods are available: =head2 C This method creates a new XML::TMX::Reader object. This process checks for the existence of the file and extracts some meta-information from the TMX header; my $reader = XML::TMX::Reader->new("my.tmx"); =cut sub new { my ($class, $file) = @_; return undef unless -f $file; my $self = bless { encoding => _guess_encoding($file), filename => $file, ignore_markup => 1, } => $class; $self->_parse_header; return $self; } sub _guess_encoding { my $file = shift; my $encoding = 'UTF-8'; open my $fh, "<", $file or die "can't open $file"; my $line = <$fh>; if ($line =~ /encoding=['"]([^'"]+)['"]/) { $encoding = $1; } close $fh; return $encoding; } sub _parse_header { my $self = shift; my $header = ""; { local $/ = ""; open my $fh, "<:encoding($self->{encoding})", $self->{filename} or die "$!"; $header = <$fh>; close $fh; } $header =~ s/^.*().*$/$1/s; $header =~ s!().*$!$1!s; dtstring($header => ( 'header' => sub { $self->{header}{$_} = $v{$_} for (keys %v); }, 'prop' => sub { $v{type} ||= "_"; push @{$self->{header}{-prop}{$v{type}}}, $c; }, 'note' => sub { push @{$self->{header}{-note}}, $c; }, )); } =head2 C This method is used to set the flag to ignore (or not) markup inside translation unit segments. The default is to ignore those markup. If called without parameters, it sets the flag to ignore the markup. If you don't want to do that, use $reader->ignore_markup(0); =cut sub ignore_markup { my ($self, $opt) = @_; $opt = 1 unless defined $opt; $self->{ignore_markup} = $opt; } =head2 C This method returns the languages being used on the specified translation memory. Note that the module does not check for language code correctness or existence. =cut sub languages { my $self = shift; my %languages = (); $self->for_tu({proc_tu => 100}, sub { my $tu = shift; for ( keys %$tu ) { $languages{$_}++ unless m/^-/; } } ); return keys %languages; } =head2 C Use C to process all translation units from a TMX file. This version iterates for all tu (one at the time) The configuration hash is a reference to a Perl hash. At the moment these are valid options: =over =item C<-verbose> Set this option to a true value and a counter of the number of processed translation units will be printed to stderr. =item C<-output> | C Filename to output the changed TMX to. Note that if you use this option, your function should return a hash reference where keys are language names, and values their respective translation. =item C Write at most C TUs =item C Process at most C TUs =item C Only process TU that match C. =item C<-raw> Pass the XML directly to the method instead of parsing it. =item C<-verbatim> Use segment contents verbatim, without any normalization. =back The function will receive two arguments: =over =item * a reference to a hash which maps: the language codes to the respective translation unit segment; a special key "-prop" that maps property names to properties; a special key "-note" that maps to a list of notes. =item * a reference to a hash which contains the attributes for those translation unit tag; =back If you want to process the TMX and return it again, your function should return an hash reference where keys are the languages, and values their respective translation. =cut sub for_tu { my $self = shift; my $conf = { -header => 1 }; ref($_[0]) eq "HASH" and $conf = {%$conf , %{shift(@_)}}; my $code = shift; die "invalid processor" unless ref($code) eq "CODE"; local $/; my $outputingTMX = 0; my $tmx; my $data; my $gen=0; my %h = ( -type => { tu => 'SEQ', tuv => 'SEQ' }, tu => sub { my $tu; for my $va (@$c) { if ($va->[0] eq "-prop") { push @{$tu->{$va->[0]}{$va->[1]}}, $va->[2] } elsif ($va->[0] eq "-note") { push @{$tu->{$va->[0]}}, $va->[1] } else { $tu->{$va->[0]} = $va->[1] } } my ($ans, $v) = $code->($tu, \%v); # Check if the user wants to create a TMX and # forgot to say us if (!$outputingTMX && $ans && ref($ans) eq "HASH") { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my %header = %{$self->{header}}; exists($conf->{-prop}) and $header{-prop} = $conf->{-prop}; exists($conf->{-note}) and $header{-note} = $conf->{-note}; $tmx->start_tmx(encoding => $self->{encoding}, %header); } } # Add the translation unit if ($ans && ref($ans) eq "HASH") { $gen++; %v = %$v if ($v && ref($v) eq "HASH"); my %ans = (%v, %$ans); $tmx->add_tu(-verbatim => $conf->{-verbatim}, %ans); } }, tuv => sub { my $tuv; for my $v (@$c) { if ($v->[0] eq "-prop") { push @{$tuv->{$v->[0]}{$v->[1]}}, $v->[2] } elsif ($v->[0] eq "-note") { push @{$tuv->{$v->[0]}}, $v->[1] } elsif ($v->[0] eq "-cdata") { $tuv->{-iscdata} = 1; $tuv->{-seg} = $v->[1]; } else { $tuv->{-seg} = $v->[0]; } } [ $v{lang} || $v{'xml:lang'} || "_" => $tuv ] }, prop => sub { ["-prop", $v{type} || "_", $c] }, note => sub { ["-note" , $c] }, seg => sub { return ($v{iscdata}) ? [ -cdata => $c ] : [ $c ] }, -cdata => sub { father->{'iscdata'} = 1; $c }, hi => sub { $self->{ignore_markup}?$c:toxml }, ph => sub { $self->{ignore_markup}?$c:toxml }, ); $/ = "\n"; $h{-outputenc} = $h{-inputenc} = $self->{encoding}; my $resto = ""; ## Go through the header... open X, $self->{filename} or die "cannot open file $self->{filename}\n"; while () { if (/^\xFF\xFE/) { die("UTF16 encoding not supported; try 'iconv -f unicode -t utf8 tmx' before\n"); } next if /^\s*$/; last if /)(.*)!s) { $resto = $3; } # If we have an output filename, user wants to output a TMX $conf->{-output} = $conf->{output} if defined($conf->{output}); if (defined($conf->{-output})) { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my %header = %{$self->{header}}; exists($conf->{-prop}) and $header{-prop} = $conf->{-prop}; exists($conf->{-note}) and $header{-note} = $conf->{-note}; $tmx->start_tmx(encoding => $self->{encoding}, -output => $conf->{-output}, %header); } } $/ = ""; my $i = 0; $conf->{-verbose}++ if $conf->{verbose}; print STDERR "." if $conf->{-verbose}; while () { ($_ = $resto . $_ and $resto = "" ) if $resto; last if /<\/body>/; $i++; print STDERR "\r$i" if $conf->{-verbose} && !($i % 10); last if defined $conf->{proc_tu} && $i > $conf->{proc_tu} ; last if defined $conf->{gen_tu} && $gen > $conf->{gen_tu}; next if defined $conf->{patt} && !m/$conf->{patt}/ ; s/\>\s+/>/; undef($data); if ($conf->{'-raw'}) { $code->($_); } else { eval { dtstring($_, %h) } ; ## dont die in invalid XML warn $@ if $@; } } print STDERR "\r$i\n" if $conf->{-verbose}; close X; $tmx->end_tmx if $conf->{-header} && $outputingTMX; } =head2 C Use this method to create a nice HTML file with the translation memories. Notice that this method is not finished yet, and relies on some images, on some specific locations. =cut sub to_html { my $self = shift; my %opt = @_; $self->for_tu(sub { my ($langs, $opts) = @_; my $ret = ""; for (keys %$langs) { next if /^-/; $ret .= "\n" } $ret .= "
"; if ($opt{icons}) { $ret .= "\"$_\"/" } else { $ret .= "$_" } $ret .= "$langs->{$_}{-seg}
"; $ret; } ); } sub for_tu2 { warn "Please update your code to use 'for_tu'\n"; &for_tu; } =head2 C deprecated. use C =head1 SEE ALSO L, TMX Specification L =head1 AUTHOR Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE J.João Almeida, Ejj@di.uminho.ptE =head1 COPYRIGHT AND LICENSE Copyright 2003-2012 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1;