[mod] 0.2.4
This commit is contained in:
parent
48693f36c6
commit
d71c016f05
7 changed files with 87 additions and 30 deletions
|
@ -1,3 +1,9 @@
|
|||
## 0.2.4 2025-02-16
|
||||
* Add `property`
|
||||
* rewrite `report` to return CSV style data
|
||||
* replace `open-file` with `open`, which uses `fetch:binary` so that URLs can be opened
|
||||
* Modify `extract` to return xs:base64Binary
|
||||
* password support
|
||||
## 0.1.6 2025-02-14
|
||||
* Add `hasLabels`
|
||||
* FIX #1 error if no labels
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "pdfbox",
|
||||
"version": "0.2.2",
|
||||
"version": "0.2.4",
|
||||
"description": "A BaseX interface to Apache Pdfbox version 3",
|
||||
"main": "src/Pdfbox3.xqm",
|
||||
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",
|
||||
|
@ -8,7 +8,7 @@
|
|||
"doc": "docs"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "%BASEX10%/bin/basex -t tests",
|
||||
"test": "%BASEX10%/bin/basex -Wt tests",
|
||||
"docs": "xqdoca"
|
||||
},
|
||||
"keywords": [
|
||||
|
|
14
readme.md
14
readme.md
|
@ -11,17 +11,23 @@ A test suite is available and workflow actions run this on BaseX 10.7 and 11.7.
|
|||
|
||||
## Features
|
||||
|
||||
|
||||
The features focus on extracting information from PDFs rather than creation or editing.
|
||||
|
||||
The features focus on extracting information from PDFs rather than creation or editing of PDFs.
|
||||
### Supported
|
||||
* read PDF page count.
|
||||
* read any PDF outline and return as map(s) or XML.
|
||||
* read pagelabels.
|
||||
* read page text.
|
||||
* save pdf page range to a new pdf.
|
||||
* save image of rendered pdf page.
|
||||
* open PDF with password.
|
||||
* support for xs:base64Binary in function inputs and outputs to support database and store usage.
|
||||
|
||||
AI (Deepseek) generated [documentation](doc.md)
|
||||
### Not supported:
|
||||
* creating completely new PDFs
|
||||
* Page size information
|
||||
|
||||
## Documentation
|
||||
* Function [documentation](doc.md)
|
||||
* The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
|
||||
|
||||
# Install
|
||||
|
|
BIN
samples.pdf/page-numbers-password.pdf
Normal file
BIN
samples.pdf/page-numbers-password.pdf
Normal file
Binary file not shown.
|
@ -4,5 +4,6 @@
|
|||
* [BaseX100.pdf](https://files.basex.org/releases/10.0/BaseX100.pdf)
|
||||
* [icelandic-dictionary.pdf](http://css4.pub/2015/icelandic/dictionary.pdf)
|
||||
* [page-numbers.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers).
|
||||
* [page-numbers-password.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers).
|
||||
* [Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf)
|
||||
* [Legal RAG Hallucinations](https://law.stanford.edu/wp-content/uploads/2024/05/Legal_RAG_Hallucinations.pdf)
|
||||
|
|
|
@ -21,7 +21,7 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive
|
|||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
||||
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
|
||||
declare namespace File ="java:java.io.File";
|
||||
|
||||
|
||||
|
@ -33,7 +33,7 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
|
|||
declare function pdfbox:with-pdf($src as xs:string,
|
||||
$fn as function(item())as item()*)
|
||||
as item()*{
|
||||
let $pdf:=pdfbox:open-file($src)
|
||||
let $pdf:=pdfbox:open($src)
|
||||
return try{
|
||||
$fn($pdf),pdfbox:close($pdf)
|
||||
} catch *{
|
||||
|
@ -42,13 +42,22 @@ as item()*{
|
|||
|
||||
};
|
||||
|
||||
(:~ open pdf, returns pdf object :)
|
||||
declare function pdfbox:open-file($pdfpath as xs:string)
|
||||
|
||||
(:~ open pdf using fetch:binary, returns pdf object :)
|
||||
declare function pdfbox:open($pdfpath as xs:string)
|
||||
as item(){
|
||||
pdfbox:open($pdfpath, map{})
|
||||
};
|
||||
|
||||
(:~ open pdf using fetch:binary, with optional "password" entry in $opts, returns pdf object :)
|
||||
declare function pdfbox:open($pdfpath as xs:string, $opts as map(*))
|
||||
as item(){
|
||||
try{
|
||||
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
||||
if($opts?password)
|
||||
then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password)
|
||||
else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)))
|
||||
} catch *{
|
||||
error(xs:QName("pdfbox:open-file"),"Failed to open: " || $pdfpath)
|
||||
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description)
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -66,6 +75,15 @@ as xs:string{
|
|||
PDDocument:save($pdf, File:new($savepath)),$savepath
|
||||
};
|
||||
|
||||
(:~ save $pdf to an in-memory byte stream and return the result as xs:base64Binary :)
|
||||
declare function pdfbox:binary($pdf as item())
|
||||
as xs:base64Binary{
|
||||
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
||||
let $_:=PDDocument:save($pdf, $bytes)
|
||||
return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
|
||||
=>convert:integers-to-base64()
|
||||
};
|
||||
|
||||
(: release references to $pdf:)
|
||||
declare function pdfbox:close($pdf as item())
|
||||
as empty-sequence(){
|
||||
|
@ -150,7 +168,8 @@ as item()*{
|
|||
else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
|
||||
};
|
||||
|
||||
(:~ summary CSV style info for all properties for $pdfpaths :)
|
||||
(:~ summary CSV style info for all properties for $pdfpaths
|
||||
:)
|
||||
declare function pdfbox:report($pdfpaths as xs:string*)
|
||||
as map(*){
|
||||
pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
|
||||
|
@ -162,7 +181,7 @@ as map(*){
|
|||
map{"names": array{"path",$properties},
|
||||
|
||||
"records": for $path in $pdfpaths
|
||||
let $pdf:=pdfbox:open-file($path)
|
||||
let $pdf:=pdfbox:open($path)
|
||||
return fold-left($properties,
|
||||
array{$path},
|
||||
function($result as array(*),$prop as xs:string){
|
||||
|
@ -203,12 +222,12 @@ as map(*)*{
|
|||
(:~ return bookmark info for children of $outlineItem as seq of maps :)
|
||||
declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
|
||||
as map(*)*{
|
||||
let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem)
|
||||
let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
|
||||
return map:get($find,"list")
|
||||
};
|
||||
|
||||
(: BaseX bug 10.7? error if inlined in outline :)
|
||||
declare %private function pdfbox:_outline($pdf as item(),$outlineItem as item()?)
|
||||
(:~ BaseX bug 10.7? error if inlined in outline :)
|
||||
declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
|
||||
as map(*){
|
||||
pdfbox:do-until(
|
||||
|
||||
|
@ -274,16 +293,13 @@ as item()?
|
|||
=>PDPageTree:indexOf($page)
|
||||
};
|
||||
|
||||
|
||||
|
||||
(:~ save new PDF doc from 1 based page range
|
||||
@return save path :)
|
||||
(:~ new PDF doc from 1 based page range as xs:base64Binary :)
|
||||
declare function pdfbox:extract($pdf as item(),
|
||||
$start as xs:integer,$end as xs:integer,$target as xs:string)
|
||||
as xs:string
|
||||
$start as xs:integer,$end as xs:integer)
|
||||
as xs:base64Binary
|
||||
{
|
||||
let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
|
||||
return (pdfbox:save($a,$target),pdfbox:close($a))
|
||||
return (pdfbox:binary($a),pdfbox:close($a))
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,6 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3";
|
|||
|
||||
declare variable $test:base:=file:base-dir()=>file:parent();
|
||||
|
||||
|
||||
declare %unit:test
|
||||
function test:pdfbox-version(){
|
||||
let $v:= pdfbox:version()=>trace("VER: ")
|
||||
|
@ -61,10 +60,10 @@ function test:labels(){
|
|||
};
|
||||
|
||||
declare %unit:test
|
||||
function test:extract-save(){
|
||||
function test:extract(){
|
||||
let $pdf:=test:open("samples.pdf/BaseX100.pdf")
|
||||
let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ")
|
||||
let $outline:=pdfbox:extract($pdf,2,12,$dest)
|
||||
let $bin:=pdfbox:extract($pdf,2,12)
|
||||
return unit:assert(true())
|
||||
};
|
||||
|
||||
|
@ -82,6 +81,7 @@ function test:page-image(){
|
|||
return unit:assert(true())
|
||||
};
|
||||
|
||||
|
||||
declare %unit:test
|
||||
function test:with-pdf(){
|
||||
let $path:=test:resolve("samples.pdf/BaseX100.pdf")
|
||||
|
@ -89,11 +89,39 @@ function test:with-pdf(){
|
|||
return unit:assert(starts-with($txt,"Options"))
|
||||
};
|
||||
|
||||
declare function test:open($file as xs:string)
|
||||
as item(){
|
||||
test:resolve($file)=>pdfbox:open-file()
|
||||
(:~ get PDF from url :)
|
||||
declare %unit:test
|
||||
function test:with-url(){
|
||||
let $url:="https://files.basex.org/publications/Gath%20et%20al.%20%5b2009%5d,%20INEX%20Efficiency%20Track%20meets%20XQuery%20Full%20Text%20in%20BaseX.pdf"
|
||||
|
||||
let $count:=pdfbox:with-pdf($url,pdfbox:page-count#1)
|
||||
return unit:assert-equals($count,6)
|
||||
};
|
||||
|
||||
(:~ password missing :)
|
||||
declare %unit:test("expected", "pdfbox:open")
|
||||
function test:password-bad(){
|
||||
let $pdf:=test:open("samples.pdf/page-numbers-password.pdf")
|
||||
return unit:assert(true())
|
||||
};
|
||||
|
||||
(:~ password good :)
|
||||
declare %unit:test
|
||||
function test:password-good(){
|
||||
let $pdf:=test:open("samples.pdf/page-numbers-password.pdf",map{"password":"password"})
|
||||
return unit:assert(true())
|
||||
};
|
||||
|
||||
(:---------------------------------------:)
|
||||
declare function test:open($file as xs:string,$opts as map(*))
|
||||
as item(){
|
||||
test:resolve($file)=>pdfbox:open($opts)
|
||||
};
|
||||
|
||||
declare function test:open($file as xs:string)
|
||||
as item(){
|
||||
test:open($file,map{})
|
||||
};
|
||||
declare function test:resolve($file as xs:string)
|
||||
as item(){
|
||||
file:resolve-path($file,$test:base)
|
||||
|
|
Loading…
Add table
Reference in a new issue