diff --git a/doc.md b/doc.md index 1b20032..d061fb2 100644 --- a/doc.md +++ b/doc.md @@ -78,10 +78,10 @@ let $text := pdfbox:page-text($pdf, 1) (: Extract text from page 1 :) --- ### Rendering a Page as an Image -You can render a PDF page as an image using the `pdfbox:page-image` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. +You can render a PDF page as an image using the `pdfbox:page-render` function. Supported formats include `jpg`, `png`, `bmp`, and `gif`. ```xquery -let $image := pdfbox:page-image($pdf, 1, map{"format": "png", "scale": 2}) +let $image := pdfbox:page-render($pdf, 1, map{"format": "png", "scale": 2}) ``` - `format`: The image format (default is `jpg`). @@ -90,10 +90,10 @@ let $image := pdfbox:page-image($pdf, 1, map{"format": "png", "scale": 2}) --- ### Extracting a Range of Pages -To extract a range of pages from a PDF, use the `pdfbox:extract` function. +To extract a range of pages from a PDF, use the `pdfbox:extract-range` function. ```xquery -let $extracted := pdfbox:extract($pdf, 1, 3) (: Extract pages 1 to 3 :) +let $extracted := pdfbox:extract-range($pdf, 1, 3) (: Extract pages 1 to 3 :) ``` The result is a new PDF document in binary format. 
diff --git a/package.json b/package.json index 2aebc76..c7b224e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.2.7", + "version": "0.3.1", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -8,7 +8,7 @@ "doc": "docs" }, "scripts": { - "test": "%BASEX10%/bin/basex -Wt tests", + "test": "%BASEX10%/bin/basex -Wt tests", "docs": "xqdoca" }, "keywords": [ @@ -22,11 +22,13 @@ "expkg_zone58": { "namespace": "org.expkg_zone58.Pdfbox3", "main-class": "org.apache.pdfbox.pdmodel.PDDocument", + "manifest-jar" :"pdfbox-3.0.5.jar", + "output" : "dist/pdfbox-3.0.5.fat.jar", "maven2": [ - "org.apache.pdfbox:pdfbox:3.0.4", - "org.apache.pdfbox:pdfbox-io:3.0.4", - "org.apache.pdfbox:fontbox:3.0.4", - "commons-logging:commons-logging:1.3.4" + "org.apache.pdfbox:pdfbox:3.0.5", + "org.apache.pdfbox:pdfbox-io:3.0.5", + "org.apache.pdfbox:fontbox:3.0.5", + "commons-logging:commons-logging:1.3.5" ] } diff --git a/readme.md b/readme.md index 53c084d..309d7d1 100644 --- a/readme.md +++ b/readme.md @@ -41,7 +41,7 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3"; pdfbox:with-pdf("...path/to/pdf.pdf", function($pdf){ - (1 to pdfbox:page-count($pdf))!pdfbox:page-text($pdf,.) + (1 to pdfbox:number-of-pages($pdf))!pdfbox:page-text($pdf,.) 
} ) ``` diff --git a/scripts/build.xqm b/scripts/build.xqm index 34e81a0..35c65c5 100644 --- a/scripts/build.xqm +++ b/scripts/build.xqm @@ -9,6 +9,8 @@ declare namespace pkg='http://expath.org/ns/pkg'; declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" }; declare variable $build:base:= file:resolve-path("../",static-base-uri())=>trace("base "); + +(:~ load "npm style" package.json :) declare variable $build:PKG:=json:doc(file:resolve-path("package.json",$build:base),map{"format":"xquery"}); (:~ return binary for fat jar from jars in $input-dir diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index b327816..adc3212 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -3,29 +3,18 @@ import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base "); -declare variable $maven-urls := ( -"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", -"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", -"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", -"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" -); - -let $config :=map { - "manifest-jar" : "pdfbox-3.0.4.jar", - "input-dir" : "jars/", - "output" : "dist/pdfbox-3.0.4.fat.jar", - "main-class": "org.expkg_zone58.Pdfbox3" - } -let $jar-path:=file:resolve-path($config?input-dir,$base)=>trace("jar: ") -let $_:=build:maven-download($maven-urls,$jar-path) -let $fat-jar := build:fatjar-from-folder($jar-path,$config?manifest-jar) +let $jar-path:=($build:base || "jars/")=>trace("jar: ") (: parenthesised: => binds tighter than ||, so the full path is traced :) + let $_:=build:maven-download($build:PKG?expkg_zone58?maven2=>array:flatten(), + $jar-path) -let $fat-jar:=build:update-manifest($fat-jar, $config?main-class) -let $name:=replace($config?main-class,"\.","/") || ".xqm" +let $fat-jar := build:fatjar-from-folder($jar-path,$build:PKG?expkg_zone58?manifest-jar) + +let $fat-jar:=build:update-manifest($fat-jar, 
$build:PKG?expkg_zone58?namespace) +let $name:=replace($build:PKG?expkg_zone58?namespace,"\.","/") || ".xqm" (: wrapper path follows the module namespace, the value the old $config?main-class held :) let $content:=file:read-binary($base || "src/Pdfbox3.xqm") let $fat-jar:=archive:update($fat-jar, $name,$content) -let $output-file := file:resolve-path($config?output,$base) +let $output-file := file:resolve-path($build:PKG?expkg_zone58?output,$base) return (build:write-binary($output-file, $fat-jar), trace($output-file,"fat jar: ")) \ No newline at end of file diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index dafdf7f..d9a41a8 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -1,10 +1,10 @@ xquery version '3.1'; (:~ -pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.4.jar -@see download https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ +tested with pdfbox-app-3.0.5.jar +@see download +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 :) @@ -16,7 +16,7 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -declare namespace PDPage ="org.apache.pdfbox.pdmodel.PDPage"; +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; @@ -54,8 +54,8 @@ pdfbox:open($pdfsrc, map{}) }; (:~ open pdf from 
file/url/binary, opts may have password , returns pdf object -@param $pdfsrc a fetchable url or a xs:base64Binary -@param $opts map {"password":} +@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item +@param $opts options map, optionally with a password: map {"password":} :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -75,7 +75,7 @@ as item(){ } }; -(:~ the version of the PDF specification used by $pdf e.g "1.4" +(:~ The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) declare function pdfbox:specification($pdf as item()) @@ -83,13 +83,13 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) declare function pdfbox:save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ $pdf as xs:base64Binary :) +(:~ Create binary representation of $pdf as xs:base64Binary :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -98,7 +98,7 @@ as xs:base64Binary{ =>convert:integers-to-base64() }; -(:~ release references to $pdf:) +(:~ Release any resources related to $pdf:) declare function pdfbox:close($pdf as item()) as empty-sequence(){ (# db:wrapjava void #) { @@ -106,15 +106,15 @@ as empty-sequence(){ } }; -(:~ number of pages in PDF:) -declare function pdfbox:page-count($pdf as item()) +(:~ Number of pages in PDF:) +declare function pdfbox:number-of-pages($pdf as item()) as xs:integer{ PDDocument:getNumberOfPages($pdf) }; -(:~ pdf page as image (zero is cover) +(:~ Pdf page as image (zero is cover) options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) -declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*)) +declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) @@ -130,7 +130,7 @@ as xs:base64Binary{ values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:page-count#1, + "pageCount": pdfbox:number-of-pages#1, "hasOutline": pdfbox:hasOutline#1, @@ -166,7 +166,7 @@ declare %private variable $pdfbox:property-map:=map{ }; (:~ known property names sorted :) -declare function pdfbox:defined-properties() +declare function pdfbox:property-names() as xs:string*{ $pdfbox:property-map=>map:keys()=>sort() }; @@ -275,7 +275,7 @@ as map(*){ ) }; -(:~ outline as xml :) +(:~ PDF outline in xml format :) declare function pdfbox:outline-xml($pdf as item()) as element(outline)?{ let $outline:=pdfbox:outline($pdf) @@ -294,8 +294,8 @@ as element(bookmark)* }; -(:~ return bookmark info for children of $outlineItem -@return map like{index:,title:,hasChildren:} +(:~ return bookmark info for $bookmark +@return map{index:..,title:..,hasChildren:..} :) declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) as map(*) @@ -321,8 +321,8 @@ as item()? 
=>PDPageTree:indexOf($page) }; -(:~ new PDF doc from 1 based page range as xs:base64Binary :) -declare function pdfbox:extract($pdf as item(), +(:~ Return a new PDF document, extracted from a 1-based page range, as xs:base64Binary :) +declare function pdfbox:extract-range($pdf as item(), $start as xs:integer,$end as xs:integer) as xs:base64Binary { diff --git a/tests/test.xqm b/tests/test.xqm index 0c12710..61d9f91 100644 --- a/tests/test.xqm +++ b/tests/test.xqm @@ -9,7 +9,7 @@ declare variable $test:base:=file:base-dir()=>file:parent(); declare %unit:test function test:pdfbox-version(){ let $v:= pdfbox:version()=>trace("VER: ") - return unit:assert-equals($v,"3.0.4") + return unit:assert-equals($v,"3.0.5") }; declare %unit:test @@ -22,7 +22,7 @@ function test:specification(){ declare %unit:test function test:page-count(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $pages:=pdfbox:page-count($pdf) + let $pages:=pdfbox:number-of-pages($pdf) return unit:assert-equals($pages,521) }; @@ -53,7 +53,7 @@ function test:labels(){ let $labels:=pdfbox:labels($pdf) return ( - unit:assert-equals(count($labels),pdfbox:page-count($pdf)), + unit:assert-equals(count($labels),pdfbox:number-of-pages($pdf)), unit:assert($labels[1]="i") , unit:assert($labels[27]="1") ) @@ -63,7 +63,7 @@ declare %unit:test function test:extract(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ") - let $bin:=pdfbox:extract($pdf,2,12) + let $bin:=pdfbox:extract-range($pdf,2,12) return unit:assert(true()) }; @@ -77,7 +77,7 @@ let $pdf:=test:open("samples.pdf/BaseX100.pdf") declare %unit:test function test:page-image(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $image:=pdfbox:page-image($pdf,0,map{}) + let $image:=pdfbox:page-render($pdf,0,map{}) return unit:assert(true()) }; @@ -94,7 +94,7 @@ declare %unit:test function test:with-url(){ let 
$url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf" - let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1) + let $count:=pdfbox:with-pdf($url,pdfbox:number-of-pages#1) return unit:assert-equals($count,6) }; @@ -141,13 +141,13 @@ function test:property(){ declare %unit:test("expected", "pdfbox:property") function test:property-bad(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") - let $title:=pdfbox:property($pdf, "totle") + let $title:=pdfbox:property($pdf, "badname") return unit:assert(exists($title)) }; -(:~ Test for pdfbox:defined-properties function :) +(:~ Test for pdfbox:property-names function :) declare %unit:test function test:defined-properties(){ - let $properties:=pdfbox:defined-properties() + let $properties:=pdfbox:property-names() return unit:assert(exists($properties)) };