From a0cfa6d937d74f48e032fa1cfc15c366decf6c00 Mon Sep 17 00:00:00 2001
From: Andy Bunce
Date: Thu, 30 Jan 2025 16:57:25 +0000
Subject: [PATCH] [mod] download

---
 scripts/build.xqm       | 22 ++++++++--
 scripts/download.xq     | 17 --------
 scripts/make-fat-jar.xq |  7 +++
 src/Pdfbox3.xqm         | 96 +++++++++++++++++++++++------------------
 src/test/test.xqm       | 48 +++++++++++++--------
 5 files changed, 110 insertions(+), 80 deletions(-)
 delete mode 100644 scripts/download.xq

diff --git a/scripts/build.xqm b/scripts/build.xqm
index 9cfa38b..4e53ba3 100644
--- a/scripts/build.xqm
+++ b/scripts/build.xqm
@@ -1,7 +1,9 @@
 (:~ build utils for REPO packaging :)
 module namespace build = 'urn:quodatum:build1';
 
-(:~ create a flat fat jar :)
+(:~ create a flat fat jar from jars in $input-dir
+keeping only META-INF from $manifest-jar
+:)
 declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
 as xs:base64Binary {
 let $fold :=
@@ -13,13 +15,16 @@ function ($res as map (*), $jar as xs:string) {
   map { "name" : ($res? name, $paths),
         "content" : ($res? content,archive:extract-binary($bin, $paths)) }
 }
-let $res := fold-left(file:list($input-dir, false(), "*.jar"), map { }, $fold)
+let $res := file:list($input-dir, false(), "*.jar")
+            =>fold-left( map { }, $fold)
 return archive:create($res? name, $res? content,
                       map { "format" : "zip", "algorithm" : "deflate" })
 };
 
 
-(:~ create a fat jar with lib :)
+(:~ create a fat jar with lib
+@remark
+:)
 declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as xs:string)
 {
 let $bin :=file:read-binary($input-dir || $manifest-jar)
@@ -49,3 +54,14 @@ declare function build:update($jar as xs:base64Binary,$name as xs:string,$file
 as xs:base64Binary{
  archive:update($jar,$name,$file)
 };
+
+
+declare variable $build:REPO as xs:string external :="https://repo1.maven.org/maven2/";
+(:~ download each of $urls (resolved against $build:REPO) into $destdir, skipping files that already exist :)
+declare function build:maven-download($urls as xs:string*,$destdir as xs:string)
+as empty-sequence(){
+for $f in $urls
+let $dest:=$destdir || replace($f,"^.*/","")
+where not(file:exists($dest))
+return file:write-binary($dest, fetch:binary(resolve-uri($f,$build:REPO)=>trace("Download: ")))
+};
\ No newline at end of file
diff --git a/scripts/download.xq b/scripts/download.xq
deleted file mode 100644
index 3004e5f..0000000
--- a/scripts/download.xq
+++ /dev/null
@@ -1,17 +0,0 @@
-
-import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
-
-declare variable $files := (
-"https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
-"https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
-"https://repo1.maven.org/maven2/org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
-"https://repo1.maven.org/maven2/commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
-);
-
-
-
-let $base:= file:resolve-path("../",static-base-uri())
-let $target:=file:resolve-path("jars/",$base)
-for $f in $files
-let $n:=replace($f,"^.*/","") =>trace("N")
-return file:write-binary($target || $n, fetch:binary($f))
\ No newline at end of file
diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq
index 51b8b16..a1cb737 100644
--- a/scripts/make-fat-jar.xq
+++ b/scripts/make-fat-jar.xq
@@ -1,6 +1,12 @@
 import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
 
 
+declare variable $urls := (
+"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
+"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
+"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
+"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
+);
 (: Main execution
 Main-Class: org.basex.modules.Hello
 :)
@@ -12,6 +18,7 @@ let $config :=map {
   "main-class": "org.expkg_zone58.Pdfbox3"
 }
 let $jar-path:=file:resolve-path($config?input-dir,$config?base=>trace("base "))=>trace("jar: ")
+let $_:=build:maven-download($urls,$jar-path)
 let $fat-jar := build:fatjar-from-folder($jar-path,$config?manifest-jar)
 let $fat-jar:=build:update-manifest($fat-jar, $config?main-class)
 
diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm
index a14a01a..a2a9315 100644
--- a/src/Pdfbox3.xqm
+++ b/src/Pdfbox3.xqm
@@ -49,10 +49,12 @@ as item(){
   Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
 };
 
-(:~ the version of the PDF specification used by $pdf :)
-declare function pdfbox:pdfVersion($pdf as item())
-as xs:float{
-  PDDocument:getVersion($pdf)
+(:~ the version of the PDF specification used by $pdf e.g "1.4"
+returned as string to avoid rounding issues
+ :)
+declare function pdfbox:specification($pdf as item())
+as xs:string{
+  PDDocument:getVersion($pdf)=>string()
 };
 
 (:~ save pdf $pdf to $savepath , returns $savepath :)
@@ -97,45 +99,49 @@ as xs:string{
   Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
 };
 
-(:~ outline for $doc as map()* :)
-declare function pdfbox:outline($doc as item())
+(:~ outline for $pdf as map()* :)
+declare function pdfbox:outline($pdf as item())
 as map(*)*{
   (# db:wrapjava some #) {
   let $outline:=
-        PDDocument:getDocumentCatalog($doc)
+        PDDocument:getDocumentCatalog($pdf)
         =>PDDocumentCatalog:getDocumentOutline()
   return if(exists($outline))
-         then pdfbox:outline($doc,PDOutlineItem:getFirstChild($outline))
+         then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline))
   }
 };
 
 (:~ return bookmark info for children of $outlineItem
 as seq of maps :)
-declare function pdfbox:outline($doc as item(),$outlineItem as item()?)
+declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
 as map(*)*{
-  let $find as map(*):=pdfbox:_outline($doc ,$outlineItem)
+  let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem)
   return map:get($find,"list")
 };
 
 (: BaseX bug 10.7? error if inlined in outline :)
-declare %private function pdfbox:_outline($doc as item(),$outlineItem as item()?)
+declare %private function pdfbox:_outline($pdf as item(),$outlineItem as item()?)
 as map(*){
-  hof:until(
-            function($output) { empty($output?this) },
-            function($input ) {
-                      let $bk:= pdfbox:bookmark($input?this,$doc)
+  pdfbox:do-until(
+
+     map{"list":(),"this":$outlineItem},
+
+     function($input ) {
+                      let $bk:= pdfbox:bookmark($input?this,$pdf)
                       let $bk:= if($bk?hasChildren)
-                                then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
+                                then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                                      return map:merge(($bk,map:entry("children",$kids)))
                                 else $bk
                       return map{
                             "list": ($input?list, $bk),
                             "this": PDOutlineItem:getNextSibling($input?this)}
                       },
-            map{"list":(),"this":$outlineItem}
-  )
+
+     function($output) { empty($output?this) }
+  )
 };
 
+
 (:~ outline as xml :)
 declare function pdfbox:outline-xml($outline as map(*)*)
 as element(outline){
@@ -156,27 +162,22 @@ as element(bookmark)*
 (:~ return bookmark info for children of $outlineItem
 @return map like{index:,title:,hasChildren:}
 :)
-declare function pdfbox:bookmark($bookmark as item(),$doc as item())
+declare function pdfbox:bookmark($bookmark as item(),$pdf as item())
 as map(*)
 {
  map{
-  "index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc),
+  "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
   "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("�",""),
   "hasChildren": PDOutlineItem:hasChildren($bookmark)
   }
 };
 
-declare function pdfbox:outx($page ,$document)
-{
-  let $currentPage := PDOutlineItem:findDestinationPage($page,$document)
-  let $pageNumber := pdfbox:pageIndex($currentPage,$document)
-  return $pageNumber
-};
+
 
 (:~ pageIndex of $page in $pdf :)
-declare function pdfbox:pageIndex(
+declare function pdfbox:page-index(
    $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
-   $pdf)
+   $pdf as item())
 as item()?
 {
   if(exists($page))
@@ -198,28 +199,20 @@ as xs:string
 };
 
 
-(:~ pageLabel info
+(:~ pageLabel for every page
 @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
 @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
 :)
-declare function pdfbox:getPageLabels($pdf as item())
-as item()
-{
-  PDDocument:getDocumentCatalog($pdf)
-  =>PDDocumentCatalog:getPageLabels()
-};
-
-(:~ pageLabel for every page:)
-declare function pdfbox:pageLabels($doc as item())
+declare function pdfbox:labels($pdf as item())
 as xs:string*
 {
-  PDDocument:getDocumentCatalog($doc)
+  PDDocument:getDocumentCatalog($pdf)
   =>PDDocumentCatalog:getPageLabels()
   =>PDPageLabels:getLabelsByPageIndices()
 };
 
 (:~ return text on $pageNo :)
-declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
+declare function pdfbox:page-text($doc as item(), $pageNo as xs:integer)
 as xs:string{
   let $tStripper := (# db:wrapjava instance #) {
          PDFTextStripper:new()
@@ -246,9 +239,9 @@ as map(*){
 @param $scale 1=72 dpi
 @return Java java.awt.image.BufferedImage object
 :)
-declare function pdfbox:pageBufferedImage($doc as item(), $pageNo as xs:integer,$scale as xs:float)
+declare function pdfbox:pageBufferedImage($pdf as item(), $pageNo as xs:integer,$scale as xs:float)
 as item(){
-  PDFRenderer:new($doc)=>PDFRenderer:renderImage($pageNo,$scale)
+  PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$scale)
 };
 
 (:~ save bufferedimage to $dest
@@ -266,4 +259,21 @@ as xs:base64Binary{
   let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, $bytes)
   return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
          =>convert:integers-to-base64()
-};
\ No newline at end of file
+};
+
+(:~ fn:do-until shim for BaseX 9+ :)
+declare function pdfbox:do-until(
+ $input as item()*,
+ $action as function(item()*, xs:integer) as item()*,
+ $predicate as function(item()*, xs:integer) as xs:boolean?
+) as item()*
+{
+  let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
+  return if(exists($fn))
+         then $fn($input,$action,$predicate)
+         else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3)
+              return if(exists($hof))
+                     then $hof($predicate,$action,$input)
+                     else error(xs:QName('pdfbox:do-until'),"No implementation found")
+
+};
diff --git a/src/test/test.xqm b/src/test/test.xqm
index bfd1f71..7b0958b 100644
--- a/src/test/test.xqm
+++ b/src/test/test.xqm
@@ -7,39 +7,52 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3";
 
 declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
 
+declare %unit:test
+function test:pdfbox-version(){
+    let $v:= pdfbox:version()=>trace("VER: ")
+    return unit:assert-equals($v,"3.0.4")
+};
+
+declare %unit:test
+function test:specification(){
+    let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
+    let $spec:=pdfbox:specification($pdf)
+    return unit:assert-equals($spec,"1.4")
+};
 
 declare %unit:test
 function test:page-count(){
-    let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
-    let $pages:=pdfbox:open($PDF)=>pdfbox:page-count()
+    let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
+    let $pages:=pdfbox:page-count($pdf)
     return unit:assert-equals($pages,521)
 };
 
 declare %unit:test
 function test:outline-none(){
-    let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
-    let $outline:=pdfbox:open($PDF)=>pdfbox:outline()
+let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
+    let $outline:=pdfbox:outline($pdf)
     return unit:assert(empty($outline))
 };
 
 declare %unit:test
 function test:outline-present(){
-    let $PDF:="samples.pdf/icelandic-dictionary.pdf"=>test:resolve()
-    let $outline:=pdfbox:open($PDF)=>pdfbox:outline()
+    let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
+    let $outline:=pdfbox:outline($pdf)
     return unit:assert(exists($outline))
 };
 
 declare %unit:test
 function test:outline-xml(){
-    let $PDF:="samples.pdf/icelandic-dictionary.pdf"=>test:resolve()
-    let $outline:=pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()
+    let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
+    let $outline:=pdfbox:outline($pdf)=>pdfbox:outline-xml()
     return unit:assert-equals(count($outline/bookmark),31)
 };
 
 declare %unit:test
 function test:pagelabels(){
-    let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
-    let $labels:=pdfbox:open($PDF)=>pdfbox:pageLabels()
+    let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
+
+    let $labels:=pdfbox:labels($pdf)
     return (
     unit:assert($labels[1]="i") ,
     unit:assert($labels[27]="1")
@@ -47,20 +60,21 @@ function test:pagelabels(){
 };
 
 declare %unit:test
-function test:save(){
+function test:extract-save(){
+    let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
     let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ")
-    let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
-    let $outline:=pdfbox:open($PDF)=>pdfbox:extract(2,12,$dest)
+    let $outline:=pdfbox:extract($pdf,2,12,$dest)
     return unit:assert(true())
 };
 
 declare %unit:test
 function test:page-text(){
-    let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
-    let $text:=pdfbox:open($PDF)=>pdfbox:getText(1)
+let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
+    let $text:=pdfbox:page-text($pdf,1)
     return unit:assert(starts-with($text,"BaseX Documentation"))
 };
 
-declare function test:resolve($file as xs:string){
-    file:resolve-path($file,$test:base)
+declare function test:pdf($file as xs:string)
+as item(){
+    file:resolve-path($file,$test:base)=>pdfbox:open()
 };