From ed84a3b3421316edc52084dcf8f544281120f498 Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Sun, 1 Jun 2025 18:00:17 +0100 Subject: [PATCH] [mod] 0.3.6 --- changelog.md | 6 ++++++ doc.md | 4 ++-- package.json | 7 ++++--- readme.md | 6 +++--- src/Pdfbox3.xqm | 55 +++++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 64 insertions(+), 14 deletions(-) diff --git a/changelog.md b/changelog.md index 2df7f79..e410a5f 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,9 @@ +# 0.3.6 2025-05-31 +* Add metadata function +* rename page-size->page-media-box +# 0.3.1 2025-05-28 +* update Apache pdfbox to 3.0.5 +* API name changes e.g. page-count->number-of-pages # 0.2.7 2025-02-18 * reduce memory use * add open from xs:base64Binary diff --git a/doc.md b/doc.md index d061fb2..d63321e 100644 --- a/doc.md +++ b/doc.md @@ -162,10 +162,10 @@ let $labels := pdfbox:labels($pdf) --- ### Getting Page Size -To get the size of a specific page, use the `pdfbox:page-size` function. +To get the size of a specific page, use the `pdfbox:page-media-box` function. 
```xquery -let $size := pdfbox:page-size($pdf, 1) (: Get size of page 1 :) +let $size := pdfbox:page-media-box($pdf, 1) (: Get size of the second page; page numbers are zero-based :) ``` --- diff --git a/package.json b/package.json index c7b224e..02f6fc7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.3.1", + "version": "0.3.6", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -9,7 +9,8 @@ }, "scripts": { "test": "%BASEX10%/bin/basex -Wt tests", - "docs": "xqdoca" + "docs": "xqdoca", + "build": "%BASEX10%/bin/basex scripts/make-xar.xq" }, "keywords": [ "pdf", @@ -22,7 +23,7 @@ "expkg_zone58": { "namespace": "org.expkg_zone58.Pdfbox3", "main-class": "org.apache.pdfbox.pdmodel.PDDocument", - "manifest-jar" :"pdfbox-3.0.4.jar", + "manifest-jar" :"pdfbox-3.0.5.jar", "output" : "dist/pdfbox-3.0.5.fat.jar", "maven2": [ "org.apache.pdfbox:pdfbox:3.0.5", diff --git a/readme.md b/readme.md index 309d7d1..6dd35d0 100644 --- a/readme.md +++ b/readme.md @@ -20,12 +20,12 @@ The features focus on extracting information from PDFs rather than creation or e * save pdf page range to a new pdf. * save image of rendered pdf page. * open PDF with password +* read XMP metadata +* Page size information * support for xs:base64Binary in function inputs and outputs to facilitate database and store usage. ### Not supported: -* creating completely new PDFs -* Page size information -* XMP processing +* creating PDFs with new content * Form processing ## Documentation diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index d9a41a8..970cc31 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -3,7 +3,7 @@ xquery version '3.1'; A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , requires pdfbox jars on classpath, i.e. 
in custom or xar tested with pdfbox-app-3.0.5.jar -@see download +@see https://pdfbox.apache.org/download.cgi @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 :) @@ -22,6 +22,12 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; +declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; +declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; @@ -89,7 +95,7 @@ as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ Create binary representation of $pdf as xs:base64Binary :) +(:~ Create binary representation of $pdf object as xs:base64Binary :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -125,6 +131,7 @@ as xs:base64Binary{ }; + (:~ property access map keys are property names, values are sequences of functions to get property from $pdf object @@ -233,6 +240,37 @@ as xs:boolean{ =>exists() }; +(:~ XMP metadata as "RDF" document +@note usually rdf:RDF root, but sometimes x:xmpmeta +:) +declare function pdfbox:metadata($pdf as item()) +as document-node(element(*))? 
+{ + let $m:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getMetadata() + return if(exists($m)) + then + let $is:=PDMetadata:exportXMPMetadata($m) + return pdfbox:do-until( + map{"n":0,"data":""}, + + function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, + + function($output,$pos) { $output?n eq -1 } + )?data=>parse-xml() + else () +}; + +(:~ read next block from XMP stream :) +declare %private function pdfbox:read-stream($is,$read as xs:string) +as map(*){ + let $blen:=4096 + let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) + let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) + let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() + return map{"n":$n, "data": $read || $data} +}; + (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) as map(*)*{ @@ -321,7 +359,10 @@ as item()? =>PDPageTree:indexOf($page) }; -(:~ Return new extract PDF doc as xs:base64Binary, using a 1 based page range :) +(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +@param $start first page to include +@param $end last page to include +:) declare function pdfbox:extract-range($pdf as item(), $start as xs:integer,$end as xs:integer) as xs:base64Binary @@ -356,8 +397,10 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }; -(:~ return size of $pageNo (zero is cover :) -declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer) +(:~ return media box of page $pageNo (zero based) +@return e.g. [0.0,0.0,168.0,239.52] + :) +declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() @@ -380,7 +423,7 @@ as xs:string?{ }; (:~ fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until +if fn:do-until not found use hof:until, note: $pos always zero :) declare %private function pdfbox:do-until( $input as item()*,