diff --git a/.vscode/settings.json b/.vscode/settings.json index 13498c2..d566f68 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,6 @@ { "basexTools.xquery.profile": "basex-10", "basexTools.xquery.showHovers": false, + "basexTools.xquery.executionDefault": "basexclient", } \ No newline at end of file diff --git a/.xqdoca b/.xqdoca index df58509..fd65659 100644 --- a/.xqdoca +++ b/.xqdoca @@ -1,4 +1,4 @@ - jars/ + src/ docs/xqdoc/ \ No newline at end of file diff --git a/changelog.md b/changelog.md index 2df7f79..e410a5f 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,9 @@ +# 0.3.6 2025-05-31 +* Add metadata function +* rename page-size->page-media-box +# 0.3.1 2025-05-28 +* update Apache pdfbox to 3.0.5 +* API name changes e.g. page-count->number-of-pages # 0.2.7 2025-02-18 * reduce memory use * add open from xs:base64Binary diff --git a/doc.md b/doc.md index 1b20032..d63321e 100644 --- a/doc.md +++ b/doc.md @@ -78,10 +78,10 @@ let $text := pdfbox:page-text($pdf, 1) (: Extract text from page 1 :) --- ### Rendering a Page as an Image -You can render a PDF page as an image using the `pdfbox:page-image` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. +You can render a PDF page as an image using the `pdfbox:page-render` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. ```xquery -let $image := pdfbox:page-image($pdf, 1, map{"format": "png", "scale": 2}) +let $image := pdfbox:page-render($pdf, 1, map{"format": "png", "scale": 2}) ``` - `format`: The image format (default is `jpg`). @@ -90,10 +90,10 @@ let $image := pdfbox:page-image($pdf, 1, map{"format": "png", "scale": 2}) --- ### Extracting a Range of Pages -To extract a range of pages from a PDF, use the `pdfbox:extract` function. +To extract a range of pages from a PDF, use the `pdfbox:extract-range` function. 
```xquery -let $extracted := pdfbox:extract($pdf, 1, 3) (: Extract pages 1 to 3 :) +let $extracted := pdfbox:extract-range($pdf, 1, 3) (: Extract pages 1 to 3 :) ``` The result is a new PDF document in binary format. @@ -162,10 +162,10 @@ let $labels := pdfbox:labels($pdf) --- ### Getting Page Size -To get the size of a specific page, use the `pdfbox:page-size` function. +To get the size of a specific page, use the `pdfbox:page-media-box` function. ```xquery -let $size := pdfbox:page-size($pdf, 1) (: Get size of page 1 :) +let $size := pdfbox:page-media-box($pdf, 1) (: Get size of page 1 :) ``` --- diff --git a/docs/pdf.xqbk b/docs/pdf.xqbk new file mode 100644 index 0000000..76e2a23 --- /dev/null +++ b/docs/pdf.xqbk @@ -0,0 +1 @@ +{"cells":[{"kind":2,"language":"xquery","value":"import module namespace pdfbox=\"org.expkg_zone58.Pdfbox3\";\r\nlet $a:=pdfbox:open(\"C:\\Users\\mrwhe\\git\\expkg-zone58\\pdfbox\\data\\1e\\gpg-book\\2-5-1\\B4541C-TRD\\255894---Book_File-Web_PDF_9798400668005_486272.pdf\")\r\nreturn pdfbox:labels($a)"}]} \ No newline at end of file diff --git a/package.json b/package.json index 58a8beb..02f6fc7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.2.7", + "version": "0.3.6", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -8,8 +8,9 @@ "doc": "docs" }, "scripts": { - "test": "%BASEX10%/bin/basex -Wt tests", - "docs": "xqdoca" + "test": "%BASEX10%/bin/basex -Wt tests", + "docs": "xqdoca", + "build": "%BASEX10%/bin/basex scripts/make-xar.xq" }, "keywords": [ "pdf", @@ -22,11 +23,13 @@ "expkg_zone58": { "namespace": "org.expkg_zone58.Pdfbox3", "main-class": "org.apache.pdfbox.pdmodel.PDDocument", - "maven": [ - "org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", - "org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", - "org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", - 
"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" + "manifest-jar" :"pdfbox-3.0.5.jar", + "output" : "dist/pdfbox-3.0.5.fat.jar", + "maven2": [ + "org.apache.pdfbox:pdfbox:3.0.5", + "org.apache.pdfbox:pdfbox-io:3.0.5", + "org.apache.pdfbox:fontbox:3.0.5", + "commons-logging:commons-logging:1.3.5" ] } diff --git a/readme.md b/readme.md index 53c084d..6dd35d0 100644 --- a/readme.md +++ b/readme.md @@ -20,12 +20,12 @@ The features focus on extracting information from PDFs rather than creation or e * save pdf page range to a new pdf. * save image of rendered pdf page. * open PDF with password +* read XMP metadata +* Page size information * support for xs:base64Binary in function inputs and outputs to facilitate database and store usage. ### Not supported: -* creating completely new PDFs -* Page size information -* XMP processing +* creating PDFs with new content * Form processing ## Documentation @@ -41,7 +41,7 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3"; pdfbox:with-pdf("...path/to/pdf.pdf", function($pdf){ - (1 to pdfbox:page-count($pdf))!pdfbox:page-text($pdf,.) + (1 to pdfbox:number-of-pages($pdf))!pdfbox:page-text($pdf,.) 
} ) ``` diff --git a/scripts/build.xqm b/scripts/build.xqm index a4e40d3..35c65c5 100644 --- a/scripts/build.xqm +++ b/scripts/build.xqm @@ -9,6 +9,8 @@ declare namespace pkg='http://expath.org/ns/pkg'; declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" }; declare variable $build:base:= file:resolve-path("../",static-base-uri())=>trace("base "); + +(:~ load "npm style" package.json :) declare variable $build:PKG:=json:doc(file:resolve-path("package.json",$build:base),map{"format":"xquery"}); (:~ return binary for fat jar from jars in $input-dir @@ -94,7 +96,7 @@ as xs:string{ declare function build:xar-create() as xs:base64Binary{ - let $_:=build:maven-download($build:PKG?expkg_zone58?maven=>array:flatten(),$build:base || "jars/") + let $_:=build:maven-download($build:PKG?expkg_zone58?maven2=>array:flatten(),$build:base || "jars/") let $entries:= build:xar-add(map{},build:jars("content"),build:jars("download")!build:content(.)) =>build:xar-add("content/Pdfbox3.xqm",build:content("src/Pdfbox3.xqm")) @@ -124,27 +126,45 @@ as xs:string{ declare function build:jars($style as xs:string) as xs:string*{ -let $src:=$build:PKG?expkg_zone58?maven=>array:flatten() -let $names:= $src!replace(.,"^.*/","") +let $artifacts:=$build:PKG?expkg_zone58?maven2=>array:flatten() +let $names:= $artifacts!build:maven-slug(.)!file:name(.) return switch($style) case "name" return $names case "download" return $names!concat("jars/",.) case "content" return $names!concat("content/",.) 
-default return $src +default return $names }; (:~ download $files from $urls to $destdir:) declare variable $build:REPO as xs:string external :="https://repo1.maven.org/maven2/"; -declare function build:maven-download($urls as xs:string*,$destdir as xs:string) + +declare function build:maven-download($artifacts as xs:string*,$destdir as xs:string) as empty-sequence(){ file:create-dir($destdir), - for $f in $urls - let $dest:=$destdir || replace($f,"^.*/","") + for $id in $artifacts + let $slug:=build:maven-slug($id) + let $dest:=$destdir || file:name($slug) where not(file:exists($dest)) - return build:write-binary($dest, fetch:binary(resolve-uri($f,$build:REPO) + return build:write-binary($dest, fetch:binary(resolve-uri($slug,$build:REPO) =>trace("Download: "))) }; +(:~ non-rooted url for maven artifact :) +declare function build:maven-slug($artifact as xs:string) +as xs:string{ + + let $parts:=if(matches($artifact,'[^:]+:[^:]+:[^:]+')) + then tokenize($artifact,":") + else error(xs:QName('build:maven-slug'),"invalid format required 'groupId:id:version'") + + return ( + translate($parts[1],".","/"), + $parts[2], + $parts[3], + string-join(($parts[2] , "-" , $parts[3] , ".jar"),"") + )=>string-join("/") +}; + (:~ write-binary, creating dir if required :) declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?) 
as empty-sequence(){ diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index b327816..adc3212 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -3,29 +3,18 @@ import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base "); -declare variable $maven-urls := ( -"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", -"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", -"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", -"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" -); - -let $config :=map { - "manifest-jar" : "pdfbox-3.0.4.jar", - "input-dir" : "jars/", - "output" : "dist/pdfbox-3.0.4.fat.jar", - "main-class": "org.expkg_zone58.Pdfbox3" - } -let $jar-path:=file:resolve-path($config?input-dir,$base)=>trace("jar: ") -let $_:=build:maven-download($maven-urls,$jar-path) -let $fat-jar := build:fatjar-from-folder($jar-path,$config?manifest-jar) +let $jar-path:=$build:base || "jars/"=>trace("jar: ") + let $_:=build:maven-download($build:PKG?expkg_zone58?maven2=>array:flatten(), + $build:base || "jars/") -let $fat-jar:=build:update-manifest($fat-jar, $config?main-class) -let $name:=replace($config?main-class,"\.","/") || ".xqm" +let $fat-jar := build:fatjar-from-folder($jar-path,$build:PKG?expkg_zone58?manifest-jar) + +let $fat-jar:=build:update-manifest($fat-jar, $build:PKG?expkg_zone58?main-class) +let $name:=replace($build:PKG?expkg_zone58?main-class,"\.","/") || ".xqm" let $content:=file:read-binary($base || "src/Pdfbox3.xqm") let $fat-jar:=archive:update($fat-jar, $name,$content) -let $output-file := file:resolve-path($config?output,$base) +let $output-file := file:resolve-path($build:PKG?expkg_zone58?output,$base) return (build:write-binary($output-file, $fat-jar), trace($output-file,"fat jar: ")) \ No newline at end of file diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 081eb19..970cc31 100644 --- a/src/Pdfbox3.xqm +++ 
b/src/Pdfbox3.xqm @@ -1,10 +1,10 @@ xquery version '3.1'; (:~ -pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.4.jar -@see download https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ +tested with pdfbox-app-3.0.5.jar +@see https://pdfbox.apache.org/download.cgi +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 :) @@ -16,12 +16,18 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -declare namespace PDPage ="org.apache.pdfbox.pdmodel.PDPage"; +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; +declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; +declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = 
"java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; @@ -54,8 +60,8 @@ pdfbox:open($pdfsrc, map{}) }; (:~ open pdf from file/url/binary, opts may have password , returns pdf object -@param $pdfsrc a fetchable url or a xs:base64Binary -@param $opts map {"password":} +@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item +@param $opts options optionally with map {"password":} :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -75,7 +81,7 @@ as item(){ } }; -(:~ the version of the PDF specification used by $pdf e.g "1.4" +(:~ The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) declare function pdfbox:specification($pdf as item()) @@ -83,13 +89,13 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) declare function pdfbox:save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ $pdf as xs:base64Binary :) +(:~ Create binary representation of $pdf object as xs:base64Binary :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -98,7 +104,7 @@ as xs:base64Binary{ =>convert:integers-to-base64() }; -(: release references to $pdf:) +(:~ Release any resources related to $pdf:) declare function pdfbox:close($pdf as item()) as empty-sequence(){ (# db:wrapjava void #) { @@ -106,15 +112,15 @@ as empty-sequence(){ } }; -(:~ number of pages in PDF:) -declare function pdfbox:page-count($pdf as item()) +(:~ Number of pages in PDF:) +declare function pdfbox:number-of-pages($pdf as item()) as xs:integer{ PDDocument:getNumberOfPages($pdf) }; -(:~ render of $pdf page to image +(:~ Pdf page as image (zero 
is cover) options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) -declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*)) +declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) @@ -125,12 +131,13 @@ as xs:base64Binary{ }; + (:~ property access map keys are property names, values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:page-count#1, + "pageCount": pdfbox:number-of-pages#1, "hasOutline": pdfbox:hasOutline#1, @@ -166,7 +173,7 @@ declare %private variable $pdfbox:property-map:=map{ }; (:~ known property names sorted :) -declare function pdfbox:defined-properties() +declare function pdfbox:property-names() as xs:string*{ $pdfbox:property-map=>map:keys()=>sort() }; @@ -233,6 +240,37 @@ as xs:boolean{ =>exists() }; +(:~ XMP metadata as "RDF" document +@note usually rdf:RDF root, but sometimes x:xmpmeta +:) +declare function pdfbox:metadata($pdf as item()) +as document-node(element(*))? 
+{ let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () +}; + +(:~ read next block from XMP stream :) +declare %private function pdfbox:read-stream($is,$read as xs:string) +as map(*){ + let $blen:=4096 + let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) + let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) + let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() + return map{"n":$n, "data": $read || $data} +}; + (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) as map(*)*{ @@ -275,7 +313,7 @@ as map(*){ ) }; -(:~ outline as xml :) +(:~ PDF outline in xml format :) declare function pdfbox:outline-xml($pdf as item()) as element(outline)?{ let $outline:=pdfbox:outline($pdf) @@ -284,6 +322,7 @@ as element(outline)?{ else () }; +(:~ recursive outline map to XML :) declare %private function pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { @@ -293,8 +332,8 @@ as element(bookmark)* }; -(:~ return bookmark info for children of $outlineItem -@return map like{index:,title:,hasChildren:} +(:~ return bookmark info for $bookmark +@return map{index:..,title:..,hasChildren:..} :) declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) as map(*) @@ -320,8 +359,11 @@ as item()? 
=>PDPageTree:indexOf($page) }; -(:~ new PDF doc from 1 based page range as xs:base64Binary :) -declare function pdfbox:extract($pdf as item(), +(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +@param $start first page to include +@param $end last page to include +:) +declare function pdfbox:extract-range($pdf as item(), $start as xs:integer,$end as xs:integer) as xs:base64Binary { @@ -355,15 +397,17 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }; -(:~ return size of $pageNo zero based :) -declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer) +(:~ return size of $pageNo (zero based) +@result e.g. [0.0,0.0,168.0,239.52] + :) +declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }; -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +(:~ version of Apache Pdfbox in use e.g. 
"3.0.4" :) declare function pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() @@ -379,7 +423,7 @@ as xs:string?{ }; (:~ fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until +if fn:do-until not found use hof:until, note: $pos always zero :) declare %private function pdfbox:do-until( $input as item()*, diff --git a/tests/test.xqm b/tests/test.xqm index 0c12710..61d9f91 100644 --- a/tests/test.xqm +++ b/tests/test.xqm @@ -9,7 +9,7 @@ declare variable $test:base:=file:base-dir()=>file:parent(); declare %unit:test function test:pdfbox-version(){ let $v:= pdfbox:version()=>trace("VER: ") - return unit:assert-equals($v,"3.0.4") + return unit:assert-equals($v,"3.0.5") }; declare %unit:test @@ -22,7 +22,7 @@ function test:specification(){ declare %unit:test function test:page-count(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $pages:=pdfbox:page-count($pdf) + let $pages:=pdfbox:number-of-pages($pdf) return unit:assert-equals($pages,521) }; @@ -53,7 +53,7 @@ function test:labels(){ let $labels:=pdfbox:labels($pdf) return ( - unit:assert-equals(count($labels),pdfbox:page-count($pdf)), + unit:assert-equals(count($labels),pdfbox:number-of-pages($pdf)), unit:assert($labels[1]="i") , unit:assert($labels[27]="1") ) @@ -63,7 +63,7 @@ declare %unit:test function test:extract(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ") - let $bin:=pdfbox:extract($pdf,2,12) + let $bin:=pdfbox:extract-range($pdf,2,12) return unit:assert(true()) }; @@ -77,7 +77,7 @@ let $pdf:=test:open("samples.pdf/BaseX100.pdf") declare %unit:test function test:page-image(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $image:=pdfbox:page-image($pdf,0,map{}) + let $image:=pdfbox:page-render($pdf,0,map{}) return unit:assert(true()) }; @@ -94,7 +94,7 @@ declare %unit:test function test:with-url(){ let 
$url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf" - let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1) + let $count:=pdfbox:with-pdf($url,pdfbox:number-of-pages#1) return unit:assert-equals($count,6) }; @@ -141,13 +141,13 @@ function test:property(){ declare %unit:test("expected", "pdfbox:property") function test:property-bad(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $title:=pdfbox:property($pdf, "totle") + let $title:=pdfbox:property($pdf, "badname") return unit:assert(exists($title)) }; -(:~ Test for pdfbox:defined-properties function :) +(:~ Test for pdfbox:property-names function :) declare %unit:test function test:defined-properties(){ - let $properties:=pdfbox:defined-properties() + let $properties:=pdfbox:property-names() return unit:assert(exists($properties)) };