[fix] slick-router

This commit is contained in:
Andy Bunce 2024-04-03 12:12:32 +01:00
parent 0659567f36
commit dcd18dd3d5
18 changed files with 618 additions and 28 deletions

63
src/lib/bookpages.xqm Normal file
View file

@ -0,0 +1,63 @@
xquery version '3.1';
(:~ describe book page numbers as sequence of ranges, similar to PDF pagelabels
@author quodatum
:)
module namespace bookpages = 'urn:bookpages';
(:~ Invisible XML grammar used to parse the custom page-label description.
 : Input shape: a page count, then '#', then one or more comma-separated ranges.
 : Each range is, in order: an optional "from" number, a required type letter
 : from the set C D R r A a w, an optional prefix introduced by ':' and an
 : optional offset introduced by '@'. Whitespace is allowed between the parts.
 : Rules marked '@' are serialised as attributes; a '-' mark drops a literal
 : from the output, or drops the element wrapper of a rule.
 : NOTE(review): prefix accepts every character except ',', so it can also
 : swallow a following '@' offset - confirm whether that ambiguity is intended.
 :)
declare variable $bookpages:grammar:="
book: pagecount,'#',range,(-',', range)*.
pagecount:['0'-'9']+.
range: s,from?,s,type,s,prefix?,s,offset?.
@from: ['0'-'9']+. { pageIndex }
@type: ['C'|'D'|'R'|'r'|'A'|'a'|'w'].
@prefix: -':',~[',']+.
@offset: -'@',['0'-'9']+.
-s: ([Zs]; #9; #a; #d)*. {Optional whitespace}
";
(:~
 : Page labels for one span of consecutive pages, numbered in the given style.
 : @param $type style code: "D" decimal, "r" lower-case roman, "R" upper-case
 :        roman, "C" the fixed label "Cover"; any other value is handed to
 :        format-integer unchanged as its picture string, e.g. "A", "a" or "w"
 : @param $length number of pages in the span minus one; the span covers the
 :        numbers $first to $first + $length inclusive
 : @param $first number given to the first page of the span
 : @return one label per page of the span; empty when $length is negative
 :)
declare function bookpages:span($type as xs:string,$length as xs:integer,$first as xs:integer)
as xs:string*{
  let $numbers := $first to $first + $length
  return switch ($type)
    case "D" return $numbers ! format-integer(., "1")
    case "r" return $numbers ! format-integer(., "i")
    case "R" return $numbers ! format-integer(., "I")
    (: label every page of a cover span, so the result stays one label per page
       and the labels that follow are not shifted to the wrong page index :)
    case "C" return $numbers ! "Cover"
    default return $numbers ! format-integer(., $type)
};
(:~
 : Expand a page-label description into a sequence of page labels.
 : The description is parsed with bookpages:parse; the first parsed element
 : supplies the total page count and the remaining elements are the ranges.
 : Ranges are consumed one at a time: each runs up to the page before the
 : "from" attribute of the next range, or to the last page when there is no
 : next range or it has no "from".
 : @param $pages page-label description text
 : @return the generated labels in page order
 :)
declare function bookpages:expand($pages as xs:string)
as xs:string*{
  let $parsed := bookpages:parse($pages)
  let $last := head($parsed) => xs:integer()
  return hof:until(
    (: stop when the ranges are used up or every page has a label;
       "ge" also stops if an earlier range produced too many labels :)
    function($m){ empty($m?ranges) or count($m?result) ge $last },
    function($m){
      let $range := head($m?ranges)
      (: numbering starts at the offset attribute when given, otherwise at 1 :)
      let $start := if ($range/@offset) then xs:integer($range/@offset) else 1
      (: last page index covered by this range :)
      let $end := ($m?ranges[2]/xs:integer(@from) - 1) otherwise $last
      (: bookpages:span expects the page count minus one :)
      let $length := $end - count($m?result) - 1
      let $labels := bookpages:span($range/@type, $length, $start)
      let $labels := if ($range/@prefix)
                     then $labels ! concat($range/@prefix, .)
                     else $labels
      return map {
        'ranges': tail($m?ranges),
        'result': ($m?result, $labels)
      }},
    (: initial input = grammar ranges :)
    map { 'ranges': tail($parsed), 'result': () }
  )?result
};
(:~
 : Parse a page-number description to XML using the grammar held in the
 : module variable bookpages:grammar.
 : @param $pages description text, for example "10#C,2r,5D"
 : @return the child elements of the parsed book element: the pagecount
 :         element followed by one range element for each range
 :)
declare function bookpages:parse($pages as xs:string)
as element()*{
  (: descendant-or-self locates the book element whether the parser hands back
     a document node or the root element itself; the result includes the
     pagecount element, so the return type cannot be limited to range elements :)
  invisible-xml($bookpages:grammar)($pages)/descendant-or-self::book/*
};

View file

@ -2,12 +2,12 @@ xquery version '3.1';
(:~
pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library,
requires pdfbox jar on classpath
3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar
3.02+ required tested with pdfbox-app-3.0.2.jar
@see https://repository.apache.org/content/groups/snapshots/org/apache/pdfbox/pdfbox-app/3.0.2-SNAPSHOT/
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/
:)
module namespace pdfbox="urn:expkg-zone58:pdfbox:3";
module namespace pdfbox="urn:expkg-zone58:pdfbox3";
declare namespace Loader ="java:org.apache.pdfbox.Loader";
declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
@ -96,7 +96,7 @@ as map(*)*{
};
(: return bookmark info for children of $outlineItem as seq of maps :)
declare function pdfbox:outline($doc,$outlineItem )
declare function pdfbox:outline($doc as item(),$outlineItem as item()?)
as map(*)*
{
let $find:=hof:until(
@ -143,7 +143,7 @@ as map(*)
}
};
declare function pdfbox:outx($page,$document)
declare function pdfbox:outx($page ,$document)
{
let $currentPage := PDOutlineItem:findDestinationPage($page,$document)
let $pageNumber := pdfbox:pageIndex($currentPage,$document)

View file

@ -1,16 +1,16 @@
xquery version '3.1';
(:~ look for pagenos in pdf text
pagenos:page-report($doc )=>pagenos:inverted-map()
pdfscrape:page-report($doc )=>pdfscrape:inverted-map()
:)
module namespace pagenos = 'urn:pageno';
import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm";
module namespace pdfscrape = 'urn:pdfscrape';
import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "pdfbox3.xqm";
(: Look for a possible page number in the first/last line of the page text.
@todo last line and roman
Each pattern key is two letters:
1st letter = number system (D=decimal, R=Roman)
2nd letter = side of the line (L=left, R=right)
:)
declare variable $pagenos:pats:=map{
declare variable $pdfscrape:pats:=map{
"DL": "^([1-9][0-9]*).*",
"DR": ".*[^0-9]([1-9][0-9]*)$",
"RL": "^([ivxlc]+).*",
@ -18,47 +18,56 @@ declare variable $pagenos:pats:=map{
};
(: page-reports for all pages :)
declare function pagenos:page-report($doc as item())
declare function pdfscrape:page-report($doc as item())
as element(page)*{
let $count:=pdfbox:page-count($doc)=>trace("Pages: ")
return (0 to $count -1)!pagenos:page-report($doc,.)
return (1 to $count )!pdfscrape:page-report($doc,.)
};
(: page-report for given page :)
declare function pagenos:page-report($doc as item(), $page as xs:integer)
declare function pdfscrape:page-report($doc as item(), $page as xs:integer)
as element(page){
let $txt:=pdfbox:getText($doc,$page)
let $line1:=substring-before($txt,file:line-separator())
let $fn:=function($acc,$this){ $acc otherwise pagenos:line-report($this,$line1)}
let $found:=map:keys($pagenos:pats)=>fold-left( (),$fn)
let $fn:=function($acc,$this){ $acc otherwise pdfscrape:line-report($this,$line1)}
let $found:=map:keys($pdfscrape:pats)=>fold-left( (),$fn)
return <page index="{ $page }">{ $found, $line1 }</page>
};
(: empty or attributes created by matching $style with $line1 :)
declare function pagenos:line-report($style as xs:string, $line1 as xs:string)
declare function pdfscrape:line-report($style as xs:string, $line1 as xs:string)
as attribute(*)*{
if(matches($line1,$pagenos:pats?($style)))
if(matches($line1,$pdfscrape:pats?($style)))
then (
attribute {"style"} { substring($style,1,1) } ,(: 1st key:)
attribute {"LR"} { substring($style,2,1) } ,(: 2nd key:)
attribute {"number"} { replace($line1,$pagenos:pats?($style),"$1") }
attribute {"number"} { replace($line1,$pdfscrape:pats?($style),"$1") }
)
};
(:~ keys are parsed pageno values are pageindices where found:)
declare function pagenos:inverted-map($pages as element(page)*)
declare function pdfscrape:inverted-map($pages as element(page)*)
as map(*) {
$pages[@number]!map:entry(string(@number),string(@index))
=>map:merge(map{"duplicates":"combine"})
};
(:~
 : Percentage match between page labels and the numbers found in a page report.
 : Labels and report pages are compared pairwise in order. A page with no
 : number attribute, or an empty one, scores 0; a page whose number equals
 : the label scores 1; any other page scores -1.
 : @param $l page labels
 : @param $report page elements, read for their optional number attribute
 : @return the score total divided by the number of labels, times 100 and
 :         rounded to a whole number; 0 when $l is empty
 :)
declare function pdfscrape:score($l as xs:string*,$report as element(page)*)
{
  (: no labels means nothing to compare, and avoids dividing by zero below :)
  if (empty($l)) then 0
  else
    let $found := $report ! (if (@number) then string(@number) else "")
    let $scores := for-each-pair($l, $found, function($label, $number){
      if ($number eq "") then 0
      else if ($number eq $label) then 1
      else -1
    })
    return round(sum($scores) div count($l) * 100, 0)
};
(:~ convert roman to integer, zero if invalid
@see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/
:)
declare function pagenos:decode-roman-numeral($roman-numeral as xs:string)
declare function pdfscrape:decode-roman-numeral($roman-numeral as xs:string)
as xs:integer{
$roman-numeral => upper-case() => pagenos:characters()
$roman-numeral => upper-case() => characters()
=> for-each(map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 })
=> fold-right([0,0], function($number,$accumulator) {
if ($number lt $accumulator?2)
@ -67,8 +76,3 @@ as xs:integer{
=> array:head()
};
(:~ xpath 4:)
declare function pagenos:characters($value as xs:string?)
as xs:string*{
fn:string-to-codepoints($value) ! fn:codepoints-to-string(.)
};