diff --git a/.vscode/settings.json b/.vscode/settings.json index d566f68..13498c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,5 @@ { "basexTools.xquery.profile": "basex-10", "basexTools.xquery.showHovers": false, - "basexTools.xquery.executionDefault": "basexclient", } \ No newline at end of file diff --git a/.xqdoca b/.xqdoca index fd65659..df58509 100644 --- a/.xqdoca +++ b/.xqdoca @@ -1,4 +1,4 @@ - src/ + jars/ docs/xqdoc/ \ No newline at end of file diff --git a/changelog.md b/changelog.md index e410a5f..2df7f79 100644 --- a/changelog.md +++ b/changelog.md @@ -1,9 +1,3 @@ -# 0.3.6 2025-05-31 -* Add metadata function -* rename page-size->page-media-box -# 0.3.1 2025-05-28 -* update to Apache pdfbox to 3.0.5 -* API name changes e.g. page-count->number-of-pages # 0.2.7 2025-02-18 * reduce memory use * add open from xs:base64Binary diff --git a/doc.md b/doc.md index d63321e..1b20032 100644 --- a/doc.md +++ b/doc.md @@ -78,10 +78,10 @@ let $text := pdfbox:page-text($pdf, 1) (: Extract text from page 1 :) --- ### Rendering a Page as an Image -You can render a PDF page as an image using the `pdfbox:page-render` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. +You can render a PDF page as an image using the `pdfbox:page-image` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. ```xquery -let $image := pdfbox:page-render($pdf, 1, map{"format": "png", "scale": 2}) +let $image := pdfbox:page-image($pdf, 1, map{"format": "png", "scale": 2}) ``` - `format`: The image format (default is `jpg`). @@ -90,10 +90,10 @@ let $image := pdfbox:page-render($pdf, 1, map{"format": "png", "scale": 2}) --- ### Extracting a Range of Pages -To extract a range of pages from a PDF, use the `pdfbox:extract-range` function. +To extract a range of pages from a PDF, use the `pdfbox:extract` function. 
```xquery -let $extracted := pdfbox:extract-range($pdf, 1, 3) (: Extract pages 1 to 3 :) +let $extracted := pdfbox:extract($pdf, 1, 3) (: Extract pages 1 to 3 :) ``` The result is a new PDF document in binary format. @@ -162,10 +162,10 @@ let $labels := pdfbox:labels($pdf) --- ### Getting Page Size -To get the size of a specific page, use the `pdfbox:page-media-box` function. +To get the size of a specific page, use the `pdfbox:page-size` function. ```xquery -let $size := pdfbox:page-media-box($pdf, 1) (: Get size of page 1 :) +let $size := pdfbox:page-size($pdf, 1) (: Get size of page 1 :) ``` --- diff --git a/docs/pdf.xqbk b/docs/pdf.xqbk deleted file mode 100644 index 76e2a23..0000000 --- a/docs/pdf.xqbk +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"kind":2,"language":"xquery","value":"import module namespace pdfbox=\"org.expkg_zone58.Pdfbox3\";\r\nlet $a:=pdfbox:open(\"C:\\Users\\mrwhe\\git\\expkg-zone58\\pdfbox\\data\\1e\\gpg-book\\2-5-1\\B4541C-TRD\\255894---Book_File-Web_PDF_9798400668005_486272.pdf\")\r\nreturn pdfbox:labels($a)"}]} \ No newline at end of file diff --git a/package.json b/package.json index 02f6fc7..58a8beb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.3.6", + "version": "0.2.7", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -8,9 +8,8 @@ "doc": "docs" }, "scripts": { - "test": "%BASEX10%/bin/basex -Wt tests", - "docs": "xqdoca", - "build": "%BASEX10%/bin/basex scripts/make-xar.xq" + "test": "%BASEX10%/bin/basex -Wt tests", + "docs": "xqdoca" }, "keywords": [ "pdf", @@ -23,13 +22,11 @@ "expkg_zone58": { "namespace": "org.expkg_zone58.Pdfbox3", "main-class": "org.apache.pdfbox.pdmodel.PDDocument", - "manifest-jar" :"pdfbox-3.0.5.jar", - "output" : "dist/pdfbox-3.0.5.fat.jar", - "maven2": [ - "org.apache.pdfbox:pdfbox:3.0.5", - "org.apache.pdfbox:pdfbox-io:3.0.5", - 
"org.apache.pdfbox:fontbox:3.0.5", - "commons-logging:commons-logging:1.3.5" + "maven": [ + "org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", + "org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", + "org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", + "commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" ] } diff --git a/readme.md b/readme.md index 6dd35d0..53c084d 100644 --- a/readme.md +++ b/readme.md @@ -20,12 +20,12 @@ The features focus on extracting information from PDFs rather than creation or e * save pdf page range to a new pdf. * save image of rendered pdf page. * open PDF with password -* read XMP metadata -* Page size information * support for xs:base64Binary in function inputs and outputs to facilitate database and store usage. ### Not supported: -* creating PDFs with new content +* creating completely new PDFs +* Page size information +* XMP processing * Form processing ## Documentation @@ -41,7 +41,7 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3"; pdfbox:with-pdf("...path/to/pdf.pdf", function($pdf){ - (1 to pdfbox:number-of-pages($pdf))!pdfbox:page-text($pdf,.) + (1 to pdfbox:page-count($pdf))!pdfbox:page-text($pdf,.) 
} ) ``` diff --git a/scripts/build.xqm b/scripts/build.xqm index 35c65c5..a4e40d3 100644 --- a/scripts/build.xqm +++ b/scripts/build.xqm @@ -9,8 +9,6 @@ declare namespace pkg='http://expath.org/ns/pkg'; declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" }; declare variable $build:base:= file:resolve-path("../",static-base-uri())=>trace("base "); - -(:~ load "npm style" package.json :) declare variable $build:PKG:=json:doc(file:resolve-path("package.json",$build:base),map{"format":"xquery"}); (:~ return binary for fat jar from jars in $input-dir @@ -96,7 +94,7 @@ as xs:string{ declare function build:xar-create() as xs:base64Binary{ - let $_:=build:maven-download($build:PKG?expkg_zone58?maven2=>array:flatten(),$build:base || "jars/") + let $_:=build:maven-download($build:PKG?expkg_zone58?maven=>array:flatten(),$build:base || "jars/") let $entries:= build:xar-add(map{},build:jars("content"),build:jars("download")!build:content(.)) =>build:xar-add("content/Pdfbox3.xqm",build:content("src/Pdfbox3.xqm")) @@ -126,45 +124,27 @@ as xs:string{ declare function build:jars($style as xs:string) as xs:string*{ -let $artifacts:=$build:PKG?expkg_zone58?maven2=>array:flatten() -let $names:= $artifacts!build:maven-slug(.)!file:name(.) +let $src:=$build:PKG?expkg_zone58?maven=>array:flatten() +let $names:= $src!replace(.,"^.*/","") return switch($style) case "name" return $names case "download" return $names!concat("jars/",.) case "content" return $names!concat("content/",.) 
-default return $names +default return $src }; (:~ download $files from $urls to $destdir:) declare variable $build:REPO as xs:string external :="https://repo1.maven.org/maven2/"; - -declare function build:maven-download($artifacts as xs:string*,$destdir as xs:string) +declare function build:maven-download($urls as xs:string*,$destdir as xs:string) as empty-sequence(){ file:create-dir($destdir), - for $id in $artifacts - let $slug:=build:maven-slug($id) - let $dest:=$destdir || file:name($slug) + for $f in $urls + let $dest:=$destdir || replace($f,"^.*/","") where not(file:exists($dest)) - return build:write-binary($dest, fetch:binary(resolve-uri($slug,$build:REPO) + return build:write-binary($dest, fetch:binary(resolve-uri($f,$build:REPO) =>trace("Download: "))) }; -(:~ non-rooted url for maven artifact :) -declare function build:maven-slug($artifact as xs:string) -as xs:string{ - - let $parts:=if(matches($artifact,'[^:]+:[^:]+:[^:]+')) - then tokenize($artifact,":") - else error(xs:QName('build:maven-slug'),"invalid format required 'groupId:id:version'") - - return ( - translate($parts[1],".","/"), - $parts[2], - $parts[3], - string-join(($parts[2] , "-" , $parts[3] , ".jar"),"") - )=>string-join("/") -}; - (:~ write-binary, creating dir if required :) declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?) 
as empty-sequence(){ diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index adc3212..b327816 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -3,18 +3,29 @@ import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base "); +declare variable $maven-urls := ( +"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", +"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", +"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", +"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" +); + +let $config :=map { + "manifest-jar" : "pdfbox-3.0.4.jar", + "input-dir" : "jars/", + "output" : "dist/pdfbox-3.0.4.fat.jar", + "main-class": "org.expkg_zone58.Pdfbox3" + } -let $jar-path:=$build:base || "jars/"=>trace("jar: ") - let $_:=build:maven-download($build:PKG?expkg_zone58?maven2=>array:flatten(), - $build:base || "jars/") +let $jar-path:=file:resolve-path($config?input-dir,$base)=>trace("jar: ") +let $_:=build:maven-download($maven-urls,$jar-path) +let $fat-jar := build:fatjar-from-folder($jar-path,$config?manifest-jar) -let $fat-jar := build:fatjar-from-folder($jar-path,$build:PKG?expkg_zone58?manifest-jar) - -let $fat-jar:=build:update-manifest($fat-jar, $build:PKG?expkg_zone58?main-class) -let $name:=replace($build:PKG?expkg_zone58?main-class,"\.","/") || ".xqm" +let $fat-jar:=build:update-manifest($fat-jar, $config?main-class) +let $name:=replace($config?main-class,"\.","/") || ".xqm" let $content:=file:read-binary($base || "src/Pdfbox3.xqm") let $fat-jar:=archive:update($fat-jar, $name,$content) -let $output-file := file:resolve-path($build:PKG?expkg_zone58?output,$base) +let $output-file := file:resolve-path($config?output,$base) return (build:write-binary($output-file, $fat-jar), trace($output-file,"fat jar: ")) \ No newline at end of file diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 970cc31..081eb19 100644 --- a/src/Pdfbox3.xqm +++ 
b/src/Pdfbox3.xqm @@ -1,10 +1,10 @@ xquery version '3.1'; (:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , +pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -@see https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ +tested with pdfbox-app-3.0.4.jar +@see download https://pdfbox.apache.org/download.cgi +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ @author Andy Bunce 2025 :) @@ -16,18 +16,12 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; +declare namespace PDPage ="org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; -declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; -declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; - -declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; - - declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = 
"java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; @@ -60,8 +54,8 @@ pdfbox:open($pdfsrc, map{}) }; (:~ open pdf from file/url/binary, opts may have password , returns pdf object -@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item -@param $opts options otionally with map {"password":} +@param $pdfsrc a fetchable url or a xs:base64Binary +@param $opts map {"password":} :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -81,7 +75,7 @@ as item(){ } }; -(:~ The version of the PDF specification used by $pdf e.g "1.4" +(:~ the version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) declare function pdfbox:specification($pdf as item()) @@ -89,13 +83,13 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ save pdf $pdf to filesystem at $savepath , returns $savepath :) declare function pdfbox:save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ Create binary representation of $pdf object as xs:base64Binary :) +(:~ $pdf as xs:base64Binary :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -104,7 +98,7 @@ as xs:base64Binary{ =>convert:integers-to-base64() }; -(:~ Release any resources related to $pdf:) +(:~ release references to $pdf:) declare function pdfbox:close($pdf as item()) as empty-sequence(){ (# db:wrapjava void #) { @@ -112,15 +106,15 @@ as empty-sequence(){ } }; -(:~ Number of pages in PDF:) -declare function pdfbox:number-of-pages($pdf as item()) +(:~ number of pages in PDF:) +declare function pdfbox:page-count($pdf as item()) as xs:integer{ PDDocument:getNumberOfPages($pdf) }; -(:~ Pdf page as image (zero is cover) +(:~ render of $pdf page 
to image options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) -declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) @@ -131,13 +125,12 @@ as xs:base64Binary{ }; - (:~ property access map keys are property names, values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, + "pageCount": pdfbox:page-count#1, "hasOutline": pdfbox:hasOutline#1, @@ -173,7 +166,7 @@ declare %private variable $pdfbox:property-map:=map{ }; (:~ known property names sorted :) -declare function pdfbox:property-names() +declare function pdfbox:defined-properties() as xs:string*{ $pdfbox:property-map=>map:keys()=>sort() }; @@ -240,37 +233,6 @@ as xs:boolean{ =>exists() }; -(:~ XMP metadata as "RDF" document -@note usually rdf:RDF root, but sometimes x:xmpmeta -:) -declare function pdfbox:metadata($pdf as item()) -as document-node(element(*))? 
-{ let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () -}; - -(:~ read next block from XMP stream :) -declare %private function pdfbox:read-stream($is,$read as xs:string) -as map(*){ - let $blen:=4096 - let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) - let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) - let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() - return map{"n":$n, "data": $read || $data} -}; - (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) as map(*)*{ @@ -313,7 +275,7 @@ as map(*){ ) }; -(:~ PDF outline in xml format :) +(:~ outline as xml :) declare function pdfbox:outline-xml($pdf as item()) as element(outline)?{ let $outline:=pdfbox:outline($pdf) @@ -322,7 +284,6 @@ as element(outline)?{ else () }; -(:~ recursive ouutline map to XML :) declare %private function pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { @@ -332,8 +293,8 @@ as element(bookmark)* }; -(:~ return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} +(:~ return bookmark info for $bookmark +@return map like {index:,title:,hasChildren:} :) declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) as map(*) @@ -359,11 +320,8 @@ as item()? 
=>PDPageTree:indexOf($page) }; -(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -@param $start first page to include -@param $end last page to include -:) -declare function pdfbox:extract-range($pdf as item(), +(:~ new PDF doc from 1 based page range as xs:base64Binary :) +declare function pdfbox:extract($pdf as item(), $start as xs:integer,$end as xs:integer) as xs:base64Binary { @@ -397,17 +355,15 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }; -(:~ return size of $pageNo (zero based) -@result e.g. [0.0,0.0,168.0,239.52] - :) -declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) +(:~ return size of $pageNo zero based :) +declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer) as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }; -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +(:~ version of Apache Pdfbox in use e.g. 
"3.0.4" :) declare function pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() @@ -423,7 +379,7 @@ as xs:string?{ }; (:~ fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until, note: $pos always zero +if fn:do-until not found use hof:until :) declare %private function pdfbox:do-until( $input as item()*, diff --git a/tests/test.xqm b/tests/test.xqm index 61d9f91..0c12710 100644 --- a/tests/test.xqm +++ b/tests/test.xqm @@ -9,7 +9,7 @@ declare variable $test:base:=file:base-dir()=>file:parent(); declare %unit:test function test:pdfbox-version(){ let $v:= pdfbox:version()=>trace("VER: ") - return unit:assert-equals($v,"3.0.5") + return unit:assert-equals($v,"3.0.4") }; declare %unit:test @@ -22,7 +22,7 @@ function test:specification(){ declare %unit:test function test:page-count(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $pages:=pdfbox:number-of-pages($pdf) + let $pages:=pdfbox:page-count($pdf) return unit:assert-equals($pages,521) }; @@ -53,7 +53,7 @@ function test:labels(){ let $labels:=pdfbox:labels($pdf) return ( - unit:assert-equals(count($labels),pdfbox:number-of-pages($pdf)), + unit:assert-equals(count($labels),pdfbox:page-count($pdf)), unit:assert($labels[1]="i") , unit:assert($labels[27]="1") ) @@ -63,7 +63,7 @@ declare %unit:test function test:extract(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ") - let $bin:=pdfbox:extract-range($pdf,2,12) + let $bin:=pdfbox:extract($pdf,2,12) return unit:assert(true()) }; @@ -77,7 +77,7 @@ let $pdf:=test:open("samples.pdf/BaseX100.pdf") declare %unit:test function test:page-image(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $image:=pdfbox:page-render($pdf,0,map{}) + let $image:=pdfbox:page-image($pdf,0,map{}) return unit:assert(true()) }; @@ -94,7 +94,7 @@ declare %unit:test function test:with-url(){ let 
$url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf" - let $count:=pdfbox:with-pdf($url,pdfbox:number-of-pages#1) + let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1) return unit:assert-equals($count,6) }; @@ -141,13 +141,13 @@ function test:property(){ declare %unit:test("expected", "pdfbox:property") function test:property-bad(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $title:=pdfbox:property($pdf, "badname") + let $title:=pdfbox:property($pdf, "totle") return unit:assert(exists($title)) }; -(:~ Test for pdfbox:property-names function :) +(:~ Test for pdfbox:defined-properties function :) declare %unit:test function test:defined-properties(){ - let $properties:=pdfbox:property-names() + let $properties:=pdfbox:defined-properties() return unit:assert(exists($properties)) };