#!/usr/bin/perl -- # -*- Perl -*-

# pdf2index (doc/help/xslt/fo/pdf2index)
#
# Usage: pdf2index FILE.pdf
#
# Runs pstotext on FILE.pdf, collects the index markup from its output and
# tidies every run of <phrase role="pageno">...</phrase> page references:
#
#   1. page ranges ("12-15") are expanded into individual pages,
#   2. the pages are sorted (non-numeric, e.g. roman, pages first),
#   3. adjacent duplicates are dropped,
#   4. runs of three or more consecutive pages are collapsed to "first-last",
#   5. the surviving references are joined with ", ".
#
# The rewritten index is printed on standard output.
#
# NOTE(review): the text this version was rebuilt from had every "<...>" span
# removed by an HTML extractor.  Passages marked NOTE(review) below were
# reconstructed from what the surrounding code requires; confirm them against
# the repository copy before relying on them.

use strict;
use warnings;

my $PSTOTEXT = "pstotext";

# One page reference together with the whitespace that follows it.
my $PAGEREF = qr/<phrase role="pageno">.*?<\/phrase>\s*/s;

my $pdf = shift @ARGV;

# --- Read the index markup out of the pstotext output ----------------------

# List-form pipe open: the file name is handed to pstotext as a single
# argument and never passes through a shell.  With no file name pstotext is
# started without arguments, as the original command line did.
my @command = ($PSTOTEXT, defined $pdf ? $pdf : ());
open(my $pstotext, '-|', @command)
    or die "Cannot run $PSTOTEXT: $!\n";

my $index   = "";
my $inindex = 0;
while (my $line = <$pstotext>) {
    if ($line =~ /^<\/index/) {
        $index .= $line;
        $inindex = 0;
    }

    # NOTE(review): reconstructed -- start collecting at the opening index
    # tag and keep only lines that begin with markup.
    $inindex = 1 if $line =~ /^<index/;
    if ($inindex) {
        $index .= $line if $line =~ /^\s*</;
    }
}
close($pstotext)
    or warn "$PSTOTEXT did not finish cleanly", ($! ? ": $!" : " (status $?)"), "\n";

# --- Rewrite each run of page references -----------------------------------

# NOTE(review): the declaration of $cindex and the head of this loop were
# reconstructed; the loop body needs $1 (text before the run) and $2 (the run).
my $cindex = "";
while ($index =~ /^(.*?)((?:$PAGEREF)+)/s) {
    $cindex .= $1;
    my $run = $2;
    $index = substr($index, $+[0]);    # everything after the match (was $')

    my @pages = $run =~ /$PAGEREF/g;

    # Expand ranges.  Expanded pages are pushed as bare numbers, without the
    # surrounding <phrase> markup, exactly as the original did.
    if (@pages) {
        my @expanded;
        for my $page (@pages) {
            my $pageno = pageno($page);
            if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) {    # funky -
                my ($first, $last) = ($1, $2);
                for (my $n = $first; $n <= $last; $n++) {
                    push @expanded, "$n";
                }
            }
            else {
                push @expanded, $page;
            }
        }
        @pages = sort rangesort @expanded;
    }

    # Remove duplicates: after sorting, equal page numbers are adjacent.
    if (@pages > 1) {
        my $previous = "";
        my @unique;
        for my $page (@pages) {
            my $pageno = pageno($page);
            next if $pageno eq $previous;
            push @unique, $page;
            $previous = $pageno;
        }
        @pages = @unique;
    }

    # Collapse ranges: three or more consecutive pages become "first-last".
    if (@pages > 2) {
        my @collapsed;
        while (@pages) {
            my $len = rangelen(0, @pages);
            if ($len <= 2) {
                push @collapsed, shift @pages;
                next;
            }

            my @range = splice(@pages, 0, $len);
            my $fpage = $range[0];
            my $fpno  = pageno($fpage);
            my $lpno  = pageno($range[-1]);

            if ($fpage =~ /</) {
                # NOTE(review): reconstructed substitution -- rewrite the
                # page text inside the first reference's markup.
                $fpage =~ s/>\Q$fpno\E</>${fpno}-$lpno</s;
            }
            else {
                # A bare number produced by range expansion has no markup to
                # rewrite; without this branch the end of the range was lost.
                $fpage = "${fpno}-$lpno";
            }
            push @collapsed, $fpage;
        }
        @pages = @collapsed;
    }

    # NOTE(review): reconstructed -- trim trailing whitespace from each
    # reference and join them with ", ".
    s/\s+\z// for @pages;
    $cindex .= join(", ", @pages);
}

# NOTE(review): reconstructed -- append whatever follows the last run of
# page references and emit the result.
$cindex .= $index;

print "$cindex\n";

# pageno($page)
#
# Returns the page label of one page reference: the text up to the first
# "<" or ">" once the leading markup has been removed.  A bare number is
# returned unchanged.  Returns "?" when no label can be found.
#
# NOTE(review): the two leading-tag patterns were reconstructed; only
# "s/^...//" survived in the source text.
sub pageno {
    my $page = shift;
    return "?" unless defined $page;

    $page =~ s/^<phrase.*?>//;
    $page =~ s/^<link.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

# rangesort -- sort comparator (reads the package variables $a and $b).
#
# Pages whose label does not start with a digit (e.g. roman numerals) sort
# before numeric pages.  Numeric pages are ordered by their leading digits.
# Two non-numeric pages compare equal, which is what the original numeric
# comparison of two non-numeric strings amounted to (0 <=> 0).
sub rangesort {
    my ($anum) = pageno($a) =~ /^(\d+)/;
    my ($bnum) = pageno($b) =~ /^(\d+)/;

    # Make sure roman pages come before arabic ones.
    return -1 if !defined $anum &&  defined $bnum;
    return  1 if  defined $anum && !defined $bnum;
    return  0 if !defined $anum;

    return $anum <=> $bnum;
}

# rangelen($count, @pages)
#
# Returns how many entries of @pages, starting at index $count, form a run
# of consecutive page numbers (always at least 1).  Only labels made up
# entirely of digits can extend a run: the original test
# "$current + 1 eq $next" turned a roman label into 0 and so treated
# "iv" followed by "1" as consecutive.
sub rangelen {
    my ($count, @pages) = @_;
    my $len = 1;

    my $current = pageno($pages[$count]);
    while ($count < $#pages) {
        my $next = pageno($pages[++$count]);
        last unless $current =~ /\A\d+\z/
                 && $next    =~ /\A\d+\z/
                 && $current + 1 == $next;
        $current = $next;
        $len++;
    }

    return $len;
}