1
0
Fork 0

[mod] 0.2.4

This commit is contained in:
Andy Bunce 2025-02-16 22:31:13 +00:00
parent 48693f36c6
commit d71c016f05
7 changed files with 87 additions and 30 deletions

View file

@ -1,3 +1,9 @@
## 0.2.4 2025-02-16
* Add `property`
* Rewrite `report` to return CSV-style data
* Replace `open-file` with `open`, which uses `fetch:binary` so that URLs can be opened
* Change `extract` to return `xs:base64Binary`
* Add password support
## 0.1.6 2025-02-14
* Add `hasLabels`
* FIX #1 error if no labels

View file

@ -1,6 +1,6 @@
{
"name": "pdfbox",
"version": "0.2.2",
"version": "0.2.4",
"description": "A BaseX interface to Apache Pdfbox version 3",
"main": "src/Pdfbox3.xqm",
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",
@ -8,7 +8,7 @@
"doc": "docs"
},
"scripts": {
"test": "%BASEX10%/bin/basex -t tests",
"test": "%BASEX10%/bin/basex -Wt tests",
"docs": "xqdoca"
},
"keywords": [

View file

@ -11,17 +11,23 @@ A test suite is available and workflow actions run this on BaseX 10.7 and 11.7.
## Features
The features focus on extracting information from PDFs rather than creation or editing.
The features focus on extracting information from PDFs rather than creation or editing of PDFs.
### Supported
* read PDF page count.
* read any PDF outline and return as map(s) or XML.
* read page labels.
* read page text.
* save pdf page range to a new pdf.
* save image of rendered pdf page.
* open password-protected PDFs.
* support for xs:base64Binary in function inputs and outputs to support database and store usage.
AI (Deepseek) generated [documentation](doc.md)
### Not supported:
* creating completely new PDFs
* Page size information
## Documentation
* Function [documentation](doc.md)
* The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
# Install

Binary file not shown.

View file

@ -4,5 +4,6 @@
* [BaseX100.pdf](https://files.basex.org/releases/10.0/BaseX100.pdf)
* [icelandic-dictionary.pdf](http://css4.pub/2015/icelandic/dictionary.pdf)
* [page-numbers.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers).
* [page-numbers-password.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers).
* [Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf)
* [Legal RAG Hallucinations](https://law.stanford.edu/wp-content/uploads/2024/05/Legal_RAG_Hallucinations.pdf)

View file

@ -21,7 +21,7 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
declare namespace File ="java:java.io.File";
@ -33,7 +33,7 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
declare function pdfbox:with-pdf($src as xs:string,
$fn as function(item())as item()*)
as item()*{
let $pdf:=pdfbox:open-file($src)
let $pdf:=pdfbox:open($src)
return try{
$fn($pdf),pdfbox:close($pdf)
} catch *{
@ -42,13 +42,22 @@ as item()*{
};
(:~ open pdf, returns pdf object :)
declare function pdfbox:open-file($pdfpath as xs:string)
(:~
 : Open a PDF without supplying any options.
 : Delegates to the two-argument pdfbox:open with an empty options map,
 : so no password is passed to the loader.
 : @param $pdfpath location of the PDF; the two-argument form reads it with fetch:binary
 : @return the pdf object produced by the two-argument pdfbox:open
 :)
declare function pdfbox:open($pdfpath as xs:string)
as item(){
pdfbox:open($pdfpath, map{})
};
(:~ open pdf using fetch:binary, with an optional "password" entry in $opts, returns pdf object :)
declare function pdfbox:open($pdfpath as xs:string, $opts as map(*))
as item(){
try{
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
if($opts?password)
then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password)
else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)))
} catch *{
error(xs:QName("pdfbox:open-file"),"Failed to open: " || $pdfpath)
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description)
}
};
@ -66,6 +75,15 @@ as xs:string{
PDDocument:save($pdf, File:new($savepath)),$savepath
};
(:~
 : Return the content of a pdf object as binary data.
 : The document is saved into an in-memory java.io.ByteArrayOutputStream and
 : the stream's byte array is passed through convert:integers-to-base64.
 : @param $pdf pdf object to serialize
 : @return the saved PDF as xs:base64Binary
 :)
declare function pdfbox:binary($pdf as item())
as xs:base64Binary{
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
(: $_ is never read; the binding exists only so that the save into $bytes is performed :)
let $_:=PDDocument:save($pdf, $bytes)
return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
=>convert:integers-to-base64()
};
(: release references to $pdf:)
declare function pdfbox:close($pdf as item())
as empty-sequence(){
@ -150,7 +168,8 @@ as item()*{
else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
};
(:~ summary CSV style info for all properties for $pdfpaths :)
(:~ summary CSV style info for all properties for $pdfpaths
:)
declare function pdfbox:report($pdfpaths as xs:string*)
as map(*){
pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
@ -162,7 +181,7 @@ as map(*){
map{"names": array{"path",$properties},
"records": for $path in $pdfpaths
let $pdf:=pdfbox:open-file($path)
let $pdf:=pdfbox:open($path)
return fold-left($properties,
array{$path},
function($result as array(*),$prop as xs:string){
@ -203,12 +222,12 @@ as map(*)*{
(:~ return bookmark info for children of $outlineItem as seq of maps :)
declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
as map(*)*{
let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem)
let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
return map:get($find,"list")
};
(: BaseX bug 10.7? error if inlined in outline :)
declare %private function pdfbox:_outline($pdf as item(),$outlineItem as item()?)
(:~ BaseX bug 10.7? error if inlined in outline :)
declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
as map(*){
pdfbox:do-until(
@ -274,16 +293,13 @@ as item()?
=>PDPageTree:indexOf($page)
};
(:~ save new PDF doc from 1 based page range
@return save path :)
(:~ new PDF doc from 1 based page range as xs:base64Binary :)
declare function pdfbox:extract($pdf as item(),
$start as xs:integer,$end as xs:integer,$target as xs:string)
as xs:string
$start as xs:integer,$end as xs:integer)
as xs:base64Binary
{
let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
return (pdfbox:save($a,$target),pdfbox:close($a))
return (pdfbox:binary($a),pdfbox:close($a))
};

View file

@ -6,7 +6,6 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3";
declare variable $test:base:=file:base-dir()=>file:parent();
declare %unit:test
function test:pdfbox-version(){
let $v:= pdfbox:version()=>trace("VER: ")
@ -61,10 +60,10 @@ function test:labels(){
};
declare %unit:test
function test:extract-save(){
function test:extract(){
let $pdf:=test:open("samples.pdf/BaseX100.pdf")
let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ")
let $outline:=pdfbox:extract($pdf,2,12,$dest)
let $bin:=pdfbox:extract($pdf,2,12)
return unit:assert(true())
};
@ -82,6 +81,7 @@ function test:page-image(){
return unit:assert(true())
};
declare %unit:test
function test:with-pdf(){
let $path:=test:resolve("samples.pdf/BaseX100.pdf")
@ -89,11 +89,39 @@ function test:with-pdf(){
return unit:assert(starts-with($txt,"Options"))
};
declare function test:open($file as xs:string)
as item(){
test:resolve($file)=>pdfbox:open-file()
(:~
 : Open a PDF from an https URL through pdfbox:with-pdf and check that
 : pdfbox:page-count reports 6 pages for it.
 : NOTE(review): presumably needs network access to files.basex.org when run - confirm for CI.
 :)
declare %unit:test
function test:with-url(){
let $url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf"
let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1)
return unit:assert-equals($count,6)
};
(:~
 : Password missing: open the page-numbers-password.pdf sample without options.
 : The unit:test annotation declares that the error pdfbox:open is expected,
 : so the test passes only if opening fails with that error.
 : NOTE(review): assumes the sample is password protected - verify against the sample file.
 :)
declare %unit:test("expected", "pdfbox:open")
function test:password-bad(){
let $pdf:=test:open("samples.pdf/page-numbers-password.pdf")
(: only reached if the open above did not raise :)
return unit:assert(true())
};
(:~
 : Password good: open the page-numbers-password.pdf sample with the
 : option map{"password":"password"}.
 : The assertion itself is always true; the test fails only if the open raises an error.
 : NOTE(review): $pdf is not closed in this test - confirm whether that matters.
 :)
declare %unit:test
function test:password-good(){
let $pdf:=test:open("samples.pdf/page-numbers-password.pdf",map{"password":"password"})
return unit:assert(true())
};
(:---------------------------------------:)
(:~
 : Test helper: resolve $file with test:resolve and open the result
 : with pdfbox:open, passing $opts through unchanged.
 : @param $file path of the PDF, handed to test:resolve
 : @param $opts options map forwarded to pdfbox:open
 : @return the pdf object returned by pdfbox:open
 :)
declare function test:open($file as xs:string,$opts as map(*))
as item(){
test:resolve($file)=>pdfbox:open($opts)
};
(:~
 : Test helper: open $file with an empty options map.
 : Delegates to the two-argument test:open.
 : @param $file path of the PDF, handed on to the two-argument test:open
 : @return the pdf object returned by the two-argument test:open
 :)
declare function test:open($file as xs:string)
as item(){
test:open($file,map{})
};
declare function test:resolve($file as xs:string)
as item(){
file:resolve-path($file,$test:base)