xquery version '3.1';
(:~
 : Look for page numbers in the text of PDF pages.
 : Typical use:
 :   pdfscrape:page-report($doc) => pdfscrape:inverted-map()
 :)
module namespace pdfscrape = 'urn:pdfscrape';
import module namespace pdfbox="org.expkg_zone58.Pdfbox3" ;

(:~
 : Page number regular expressions, keyed by a two letter style code.
 :   1st letter = number system (D = decimal, R = roman)
 :   2nd letter = side of the line (L = left, R = right)
 : Note: the roman/left pattern accepts any line that starts with one of
 : the letters i, v, x, l, c, so ordinary words can match it.
 : @todo last line and roman
 :)
declare %private variable $pdfscrape:pats := map{
  "DL": "^([1-9][0-9]*).*",
  "DR": ".*[^0-9]([1-9][0-9]*)$",
  "RL": "^([ivxlc]+).*",
  "RR": ".*[^ivxlc]([ivxlc]+)$"
};

(:~
 : Order in which the styles are tried. Kept explicit because the order
 : of map:keys() is implementation dependent.
 :)
declare %private variable $pdfscrape:styles := ("DL", "DR", "RL", "RR");

(:~
 : Page reports for all pages of a document.
 : @param $doc document handle as accepted by the pdfbox module
 : @return one page element per page, in page order
 :)
declare function pdfscrape:page-report($doc as item())
as element(page)*
{
  let $count := pdfbox:page-count($doc) => trace("Pages: ")
  return (1 to $count) ! pdfscrape:page-report($doc, .)
};

(:~
 : Page report for a given page.
 : The first line of the page text is tested against each style in turn;
 : the attributes of the first style that matches are copied to the result.
 : @param $doc  document handle as accepted by the pdfbox module
 : @param $page page index, passed unchanged to pdfbox:getText
 : @return page element with an index attribute, the attributes from
 :         pdfscrape:line-report if a style matched, and the first line as text
 :)
declare function pdfscrape:page-report($doc as item(), $page as xs:integer)
as element(page)
{
  let $txt := pdfbox:getText($doc, $page)
  (: first line; the whole text when it holds no line break, "" when empty :)
  let $line1 := string(head(tokenize($txt, "\r\n|\r|\n")))
  let $first-match := function($acc, $style) {
    $acc otherwise pdfscrape:line-report($style, $line1)
  }
  let $found := $pdfscrape:styles => fold-left((), $first-match)
  return <page index="{ $page }">{ $found, $line1 }</page>
};

(:~
 : Attributes created by matching the pattern for $style against $line1.
 : @param $style key of $pdfscrape:pats
 : @param $line1 text to test
 : @return attributes style, LR and number, or empty if there is no match
 :)
declare function pdfscrape:line-report($style as xs:string, $line1 as xs:string)
as attribute(*)*
{
  let $pattern := $pdfscrape:pats?($style)
  where matches($line1, $pattern)
  return (
    attribute { "style" }  { substring($style, 1, 1) },  (: 1st letter of key :)
    attribute { "LR" }     { substring($style, 2, 1) },  (: 2nd letter of key :)
    attribute { "number" } { replace($line1, $pattern, "$1") }
  )
};

(:~
 : Invert page reports into a lookup map.
 : @param $pages page elements; those without a number attribute are ignored
 : @return map whose keys are the number attribute values and whose values
 :         are the index attribute values of the pages where they were found
 :)
declare function pdfscrape:inverted-map($pages as element(page)*)
as map(*)
{
  $pages[@number] ! map:entry(string(@number), string(@index))
  => map:merge(map{ "duplicates": "combine" })
};

(:~
 : Percentage agreement between expected page labels and page reports.
 : Paired by position: +1 when the reported number equals the label,
 : -1 when it differs, 0 when the page has no number attribute.
 : @param $l      expected page labels
 : @param $report page elements to compare with
 : @return rounded value of 100 * sum of pair scores div count($l),
 :         or 0 when $l is empty
 :)
declare function pdfscrape:score($l as xs:string*, $report as element(page)*)
{
  if (empty($l)) then 0  (: avoid division by zero :)
  else
    let $s := $report ! (if (@number) then string(@number) else "")
    let $match := for-each-pair($l, $s, function($label, $seen) {
      if ($seen eq "") then 0
      else if ($seen eq $label) then 1
      else -1
    })
    return round(sum($match) div count($l) * 100, 0)
};

(:~
 : Convert a roman numeral to an integer.
 : Letter case is ignored. Only the letters are validated, not their order.
 : @param $roman-numeral text to convert
 : @return the value, or zero if the text is empty or holds any character
 :         other than M, D, C, L, X, V, I
 : @see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/
 :)
declare function pdfscrape:decode-roman-numeral($roman-numeral as xs:string)
as xs:integer
{
  let $values := map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 }
  let $upper := upper-case($roman-numeral)
  return
    if (not(matches($upper, "^[MDCLXVI]+$"))) then 0
    else
      pdfscrape:characters($upper)
      => for-each($values)
      (: accumulator is [ running total, value of the digit to the right ];
         a digit smaller than its right neighbour is subtracted :)
      => fold-right([0, 0], function($number, $accumulator) {
           if ($number lt $accumulator?2)
           then [ $accumulator?1 - $number, $number ]
           else [ $accumulator?1 + $number, $number ]
         })
      => array:head()
};

(:~
 : Split a string into its characters.
 : @param $str text to split
 : @return one single character string per codepoint, empty for ""
 :)
declare function pdfscrape:characters($str as xs:string)
as xs:string*
{
  string-to-codepoints($str) ! codepoints-to-string(.)
};