diff --git a/changelog.md b/changelog.md index 31cf537..c974278 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,9 @@ +## 0.2.4 2025-02-16 +* Add `property` +* rewrite `report` to return CSV style data +* replace `open-file` with `open` using `fetch:binary` to allow urls +* Mod `extract` returns xs:base64Binary +* password support ## 0.1.6 2025-02-14 * Add `hasLabels` * FIX #1 error if no labels diff --git a/package.json b/package.json index 6dba658..1a39c2f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.2.2", + "version": "0.2.4", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -8,7 +8,7 @@ "doc": "docs" }, "scripts": { - "test": "%BASEX10%/bin/basex -t tests", + "test": "%BASEX10%/bin/basex -Wt tests", "docs": "xqdoca" }, "keywords": [ diff --git a/readme.md b/readme.md index 5fde2fa..cacd901 100644 --- a/readme.md +++ b/readme.md @@ -11,17 +11,23 @@ A test suite is available and workflow actions run this on BaseX 10.7 and 11.7. ## Features - -The features focus on extracting information from PDFs rather than creation or editing. - +The features focus on extracting information from PDFs rather than creation or editing of PDFs. +### Supported * read PDF page count. * read any PDF outline and return as map(s) or XML. * read pagelabels. * read page text. * save pdf page range to a new pdf. * save image of rendered pdf page. +* open PDF with password +* support for xs:base64Binary in function inputs and outputs to support database and store usage. -AI (Deepseek) generated [documentation](doc.md) +### Not supported: +* creating completely new PDFs +* Page size information + +## Documentation +* Function [documentation](doc.md) * The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful. 
# Install diff --git a/samples.pdf/page-numbers-password.pdf b/samples.pdf/page-numbers-password.pdf new file mode 100644 index 0000000..654e9a5 Binary files /dev/null and b/samples.pdf/page-numbers-password.pdf differ diff --git a/samples.pdf/readme.md b/samples.pdf/readme.md index 70bb392..76c6499 100644 --- a/samples.pdf/readme.md +++ b/samples.pdf/readme.md @@ -4,5 +4,6 @@ * [BaseX100.pdf](https://files.basex.org/releases/10.0/BaseX100.pdf) * [icelandic-dictionary.pdf](http://css4.pub/2015/icelandic/dictionary.pdf) * [page-numbers.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers). +* [page-numbers-password.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers). * [Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf) * [Legal RAG Hallucinations](https://law.stanford.edu/wp-content/uploads/2024/05/Legal_RAG_Hallucinations.pdf) diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 057c852..1d5b03d 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -21,7 +21,7 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; -declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; +declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace File ="java:java.io.File"; @@ -33,7 +33,7 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) declare function pdfbox:with-pdf($src as xs:string, $fn as function(item())as item()*) as item()*{ - let 
$pdf:=pdfbox:open-file($src) + let $pdf:=pdfbox:open($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ @@ -42,13 +42,22 @@ as item()*{ }; -(:~ open pdf, returns pdf object :) -declare function pdfbox:open-file($pdfpath as xs:string) + +(:~ open pdf using fetch:binary, returns pdf object :) +declare function pdfbox:open($pdfpath as xs:string) +as item(){ +pdfbox:open($pdfpath, map{}) +}; + +(:~ open pdf with optional password given in $opts?password, returns pdf object :) +declare function pdfbox:open($pdfpath as xs:string, $opts as map(*)) as item(){ try{ - Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) + if($opts?password) + then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password) + else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath))) } catch *{ - error(xs:QName("pdfbox:open-file"),"Failed to open: " || $pdfpath) + error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description) } }; @@ -66,6 +75,15 @@ as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; +(:~ return $pdf serialized as xs:base64Binary :) +declare function pdfbox:binary($pdf as item()) +as xs:base64Binary{ + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=PDDocument:save($pdf, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() +}; + (: release references to $pdf:) declare function pdfbox:close($pdf as item()) as empty-sequence(){ @@ -150,7 +168,8 @@ as item()*{ else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; -(:~ summary CSV style info for all properties for $pdfpaths :) +(:~ summary CSV style info for all properties for $pdfpaths +:) declare function pdfbox:report($pdfpaths as xs:string*) as map(*){ pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) @@ -162,7 +181,7 @@ as map(*){ map{"names": array{"path",$properties}, "records": for $path in $pdfpaths - let $pdf:=pdfbox:open-file($path) + let 
$pdf:=pdfbox:open($path) return fold-left($properties, array{$path}, function($result as array(*),$prop as xs:string){ @@ -203,12 +222,12 @@ as map(*)*{ (:~ return bookmark info for children of $outlineItem as seq of maps :) declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) as map(*)*{ - let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem) + let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") }; -(: BaseX bug 10.7? error if inlined in outline :) -declare %private function pdfbox:_outline($pdf as item(),$outlineItem as item()?) +(:~ BaseX bug 10.7? error if inlined in outline :) +declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?) as map(*){ pdfbox:do-until( @@ -274,16 +293,13 @@ as item()? =>PDPageTree:indexOf($page) }; - - -(:~ save new PDF doc from 1 based page range -@return save path :) +(:~ new PDF doc from 1 based page range as xs:base64Binary :) declare function pdfbox:extract($pdf as item(), - $start as xs:integer,$end as xs:integer,$target as xs:string) -as xs:string + $start as xs:integer,$end as xs:integer) +as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() - return (pdfbox:save($a,$target),pdfbox:close($a)) + return (pdfbox:binary($a),pdfbox:close($a)) }; diff --git a/tests/test.xqm b/tests/test.xqm index 155e7d3..5c041bb 100644 --- a/tests/test.xqm +++ b/tests/test.xqm @@ -6,7 +6,6 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3"; declare variable $test:base:=file:base-dir()=>file:parent(); - declare %unit:test function test:pdfbox-version(){ let $v:= pdfbox:version()=>trace("VER: ") @@ -61,10 +60,10 @@ function test:labels(){ }; declare %unit:test -function test:extract-save(){ +function test:extract(){ let $pdf:=test:open("samples.pdf/BaseX100.pdf") let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ") - let $outline:=pdfbox:extract($pdf,2,12,$dest) + let $bin:=pdfbox:extract($pdf,2,12) return 
unit:assert(true()) }; @@ -82,6 +81,7 @@ function test:page-image(){ return unit:assert(true()) }; + declare %unit:test function test:with-pdf(){ let $path:=test:resolve("samples.pdf/BaseX100.pdf") @@ -89,11 +89,39 @@ function test:with-pdf(){ return unit:assert(starts-with($txt,"Options")) }; -declare function test:open($file as xs:string) -as item(){ - test:resolve($file)=>pdfbox:open-file() +(:~ get PDF from url :) +declare %unit:test +function test:with-url(){ + let $url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf" + + let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1) + return unit:assert-equals($count,6) }; +(:~ password missing :) +declare %unit:test("expected", "pdfbox:open") +function test:password-bad(){ + let $pdf:=test:open("samples.pdf/page-numbers-password.pdf") + return unit:assert(true()) +}; + +(:~password good :) +declare %unit:test +function test:password-good(){ + let $pdf:=test:open("samples.pdf/page-numbers-password.pdf",map{"password":"password"}) + return unit:assert(true()) +}; + +(:---------------------------------------:) +declare function test:open($file as xs:string,$opts as map(*)) +as item(){ + test:resolve($file)=>pdfbox:open($opts) +}; + +declare function test:open($file as xs:string) +as item(){ + test:open($file,map{}) +}; declare function test:resolve($file as xs:string) as item(){ file:resolve-path($file,$test:base)