From d37f923d0921a366fd82a5df6046eb083237614f Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Tue, 4 Feb 2025 20:45:45 +0000 Subject: [PATCH] [mod] lost --- README.md | 6 ++-- package.json | 15 +++++--- package.xml | 17 ++++++++++ scripts/build.xqm | 15 ++++---- scripts/make-xar.xq | 2 +- scripts/maven.xqm | 22 ++++++++++++ src/Pdfbox3.xqm | 68 ++++++++++++++++++++++++------------- src/metadata/expath-pkg.xml | 2 +- 8 files changed, 106 insertions(+), 41 deletions(-) create mode 100644 package.xml create mode 100644 scripts/maven.xqm diff --git a/README.md b/README.md index 1faa1cd..a682f5c 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ # Pdfbox A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3. -It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7 +It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7. + +* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful. ## Features * read PDF page count. -* read any PDF outline and return as maps or XML. +* read any PDF outline and return as map(s) or XML. * read pagelabels. * read page text. * save pdf page range to a new pdf. diff --git a/package.json b/package.json index 9f002a7..ab537f8 100644 --- a/package.json +++ b/package.json @@ -1,13 +1,15 @@ { "name": "pdfbox", - "version": "1.0.0", + "version": "0.1.1", "description": "A BaseX interface to Apache Pdfbox version 3", - "main": "index.js", + "main": "Pdfbox.xqm", + "homepage": "https://github.com/npm/example#readme", "directories": { "doc": "docs" }, "scripts": { - "test": "%BASEX10%/bin/basex -t src/test" + "test": "%BASEX10%/bin/basex -t src/test", + "docs": "xqdoca" }, "keywords": [ "pdf", @@ -16,5 +18,8 @@ "java" ], "author": "Andy Bunce", - "license": "Apache-2.0" -} + "license": "Apache-2.0", + "quodatum": { + "random": true + } +} \ No newline at end of file diff --git a/package.xml b/package.xml new file mode 100644 index 0000000..d18c01f --- /dev/null +++ b/package.xml @@ -0,0 +1,17 @@ + + + org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar + + + org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar + + + org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar + + + commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar + + \ No newline at end of file diff --git a/scripts/build.xqm b/scripts/build.xqm index 08efe98..d7cf5dd 100644 --- a/scripts/build.xqm +++ b/scripts/build.xqm @@ -4,6 +4,8 @@ module namespace build = 'urn:quodatum:build1'; (:~ create a flat fat jar from jars in $input-dir keeping only META-INF from $manifest-jar :) +declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" }; + declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string) as xs:base64Binary { let $fold := @@ -17,9 +19,7 @@ function ($res as map (*), $jar as xs:string) { } let $res := file:list($input-dir, false(), "*.jar") =>fold-left( map { }, $fold) -return - archive:create($res? name, $res? content, - map { "format" : "zip", "algorithm" : "deflate" }) +return archive:create($res? name, $res? content,$build:archive-opts) }; (:~ create a fat jar with lib @@ -34,8 +34,7 @@ declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as ,$lib) let $content:=(archive:extract-binary($bin,$name) ,$lib!file:read-binary($input-dir || .)) -return archive:create($name, $content, - map { "format" : "zip", "algorithm" : "deflate" }) +return archive:create($name, $content,$build:archive-opts) }; (:~ update-manifest :) @@ -61,15 +60,13 @@ as xs:base64Binary{ build:xar-add(map{},file:resolve-path("jars/",$base),"content/") =>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/") =>build:xar-add(file:resolve-path("src/metadata/",$base),"") - return archive:create($entries?name, $entries?content, - map { "format" : "zip", "algorithm" : "deflate" }) + return archive:create($entries?name, $entries?content,$build:archive-opts) }; (:~ zip data for $dir :) declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string) as map(*){ -let $_:=trace(count($map?name),"size ") let $names:=if(file:is-dir($src)) then file:list($src)[not(starts-with(.,'.'))]!concat($src,.) else $src @@ -95,7 +92,7 @@ as empty-sequence(){ }; (:~ write-binary, creating dir if required :) -declare function build:write-binary($dest as xs:string,$contents) +declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?) as empty-sequence(){ file:create-dir(file:parent($dest)), file:write-binary($dest,$contents) diff --git a/scripts/make-xar.xq b/scripts/make-xar.xq index 1a2aa9a..8ce21fb 100644 --- a/scripts/make-xar.xq +++ b/scripts/make-xar.xq @@ -13,4 +13,4 @@ let $_:=build:maven-download($maven-urls,$base || "jars/") let $xar:=build:xar-create($base) let $output-file := file:resolve-path("dist/pdfbox.xar",$base) return (build:write-binary($output-file, $xar), - trace($output-file,"zar: ")) + trace($output-file,"xar: ")) diff --git a/scripts/maven.xqm b/scripts/maven.xqm new file mode 100644 index 0000000..bf4d10f --- /dev/null +++ b/scripts/maven.xqm @@ -0,0 +1,22 @@ +(:~ maven access + : + ::) +module namespace mvn = 'urn:quodatum:maven:1'; + + +declare variable $mvn:example := + org.ccil.cowan.tagsoup + tagsoup + 1.2.1 +; + +declare function mvn:url($dep as element(dependency),$ext as xs:string) +as xs:string { + + string-join( + ("https://repo.maven.apache.org/maven2/", + string-join($dep/*/string(), "/"), + "/",$dep/artifactId, "-", $dep/version, ".",$ext + )) + }; + diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index a6af956..1eda28c 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -4,8 +4,8 @@ pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar @see download https://pdfbox.apache.org/download.cgi @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ - :) + module namespace pdfbox="org.expkg_zone58.Pdfbox3"; declare namespace Loader ="java:org.apache.pdfbox.Loader"; @@ -28,21 +28,30 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace File ="java:java.io.File"; +declare variable $pdfbox:package-version:="0.1.1"; + (:~ SemVer version of this package -with build metadata for Apacke Pdfbox in use e.g. "0.1.0+pdfbox3.0.4" +with build metadata for Apache Pdfbox in use e.g. "0.1.0+pdfbox3.0.4" :) declare function pdfbox:version() as xs:string{ - "0.1.0+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion() + $pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion() }; -(: open pdf,apply function, close pdf -with-document pattern, creates local pdfobject and ensures it is closed -e.g "path..." => pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) +(:~ with-document pattern: open pdf,apply function, close pdf + creates a local pdfobject and ensures it is closed after use +e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) :) -declare function pdfbox:with-pdf($src as xs:string,$fn as function(*)*) +declare function pdfbox:with-pdf($src as xs:string, + $fn as function(item())as item()*) as item()*{ - "@TODO" + let $pdf:=pdfbox:open($src) + return try{ + $fn($pdf),pdfbox:close($pdf) + } catch *{ + pdfbox:close($pdf),error() + } + }; (:~ open pdf, returns pdf object :) @@ -63,7 +72,7 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ save pdf $pdf to $savepath , returns $savepath :) +(:~ save pdf $pdf to filesystem at $savepath , returns $savepath :) declare function pdfbox:save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath @@ -97,7 +106,7 @@ as xs:base64Binary{ }; (:~ map with document metadata :) -declare function pdfbox:information($pdf as item()) +declare function pdfbox:metadata($pdf as item()) as map(*){ let $info:=PDDocument:getDocumentInformation($pdf) return map{ @@ -105,13 +114,36 @@ as map(*){ "creator": PDDocumentInformation:getCreator($info), "producer": PDDocumentInformation:getProducer($info), "subject": PDDocumentInformation:getSubject($info), - "keywords": PDDocumentInformation:getKeywords($info), - "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)), + "keywords": PDDocumentInformation:getKeywords($info), + "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)), "author": PDDocumentInformation:getAuthor($info) } }; +(:~ summary info as map for $pdfpath :) +declare function pdfbox:report($pdfpath as xs:string) +as map(*){ + let $pdf:=pdfbox:open($pdfpath) + return (map{ + "file": $pdfpath, + "pages": pdfbox:page-count($pdf), + "hasOutline": pdfbox:hasOutline($pdf), + "specification":pdfbox:specification($pdf) + },pdfbox:metadata($pdf) +)=>map:merge() +}; + + (:~ true if $pdf has an outline for $pdf as map()* :) +declare function pdfbox:hasOutline($pdf as item()) +as xs:boolean{ + (# db:wrapjava some #) { + let $outline:= + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + return exists($outline) + } +}; (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) @@ -236,17 +268,7 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)} }; -(:~ summary info as map for $pdfpath :) -declare function pdfbox:report($pdfpath as xs:string) -as map(*){ - let $doc:=pdfbox:open($pdfpath) - return (map{ - "file": $pdfpath, - "pages": pdfbox:page-count($doc), - "outline": pdfbox:outline($doc)=>count() - },pdfbox:information($doc) -)=>map:merge() -}; + (:~ convert date :) declare %private diff --git a/src/metadata/expath-pkg.xml b/src/metadata/expath-pkg.xml index 0b18387..82d2c9b 100644 --- a/src/metadata/expath-pkg.xml +++ b/src/metadata/expath-pkg.xml @@ -1,7 +1,7 @@ BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3