[mod] 0.3.6
This commit is contained in:
parent
a3bef9d6f4
commit
ed84a3b342
5 changed files with 64 additions and 14 deletions
|
@ -1,3 +1,9 @@
|
||||||
|
# 0.3.6 2025-05-31
|
||||||
|
* Add metadata function
|
||||||
|
* rename page-size->page-media-box
|
||||||
|
# 0.3.1 2025-05-28
|
||||||
|
* update Apache pdfbox to 3.0.5
|
||||||
|
* API name changes e.g. page-count->number-of-pages
|
||||||
# 0.2.7 2025-02-18
|
# 0.2.7 2025-02-18
|
||||||
* reduce memory use
|
* reduce memory use
|
||||||
* add open from xs:base64Binary
|
* add open from xs:base64Binary
|
||||||
|
|
4
doc.md
4
doc.md
|
@ -162,10 +162,10 @@ let $labels := pdfbox:labels($pdf)
|
||||||
---
|
---
|
||||||
|
|
||||||
### Getting Page Size
|
### Getting Page Size
|
||||||
To get the size of a specific page, use the `pdfbox:page-size` function.
|
To get the size of a specific page, use the `pdfbox:page-media-box` function.
|
||||||
|
|
||||||
```xquery
|
```xquery
|
||||||
let $size := pdfbox:page-size($pdf, 1) (: Get size of page 1 :)
|
let $size := pdfbox:page-media-box($pdf, 1) (: Get media box of the second page; page numbers are zero based :)
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{
|
{
|
||||||
"name": "pdfbox",
|
"name": "pdfbox",
|
||||||
"version": "0.3.1",
|
"version": "0.3.6",
|
||||||
"description": "A BaseX interface to Apache Pdfbox version 3",
|
"description": "A BaseX interface to Apache Pdfbox version 3",
|
||||||
"main": "src/Pdfbox3.xqm",
|
"main": "src/Pdfbox3.xqm",
|
||||||
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",
|
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",
|
||||||
|
@ -9,7 +9,8 @@
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"test": "%BASEX10%/bin/basex -Wt tests",
|
"test": "%BASEX10%/bin/basex -Wt tests",
|
||||||
"docs": "xqdoca"
|
"docs": "xqdoca",
|
||||||
|
"build": "%BASEX10%/bin/basex scripts/make-xar.xq"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"pdf",
|
"pdf",
|
||||||
|
@ -22,7 +23,7 @@
|
||||||
"expkg_zone58": {
|
"expkg_zone58": {
|
||||||
"namespace": "org.expkg_zone58.Pdfbox3",
|
"namespace": "org.expkg_zone58.Pdfbox3",
|
||||||
"main-class": "org.apache.pdfbox.pdmodel.PDDocument",
|
"main-class": "org.apache.pdfbox.pdmodel.PDDocument",
|
||||||
"manifest-jar" :"pdfbox-3.0.4.jar",
|
"manifest-jar" :"pdfbox-3.0.5.jar",
|
||||||
"output" : "dist/pdfbox-3.0.5.fat.jar",
|
"output" : "dist/pdfbox-3.0.5.fat.jar",
|
||||||
"maven2": [
|
"maven2": [
|
||||||
"org.apache.pdfbox:pdfbox:3.0.5",
|
"org.apache.pdfbox:pdfbox:3.0.5",
|
||||||
|
|
|
@ -20,12 +20,12 @@ The features focus on extracting information from PDFs rather than creation or e
|
||||||
* save pdf page range to a new pdf.
|
* save pdf page range to a new pdf.
|
||||||
* save image of rendered pdf page.
|
* save image of rendered pdf page.
|
||||||
* open PDF with password
|
* open PDF with password
|
||||||
|
* read XMP metadata
|
||||||
|
* Page size information
|
||||||
* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage.
|
* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage.
|
||||||
|
|
||||||
### Not supported:
|
### Not supported:
|
||||||
* creating completely new PDFs
|
* creating PDFs with new content
|
||||||
* Page size information
|
|
||||||
* XMP processing
|
|
||||||
* Form processing
|
* Form processing
|
||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
|
@ -3,7 +3,7 @@ xquery version '3.1';
|
||||||
A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ ,
|
A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ ,
|
||||||
requires pdfbox jars on classpath, i.e. in custom or xar
|
requires pdfbox jars on classpath, i.e. in custom or xar
|
||||||
tested with pdfbox-app-3.0.5.jar
|
tested with pdfbox-app-3.0.5.jar
|
||||||
@see <a href="https://pdfbox.apache.org/download.cgi">download</a>
|
@see https://pdfbox.apache.org/download.cgi
|
||||||
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
|
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
|
||||||
@author Andy Bunce 2025
|
@author Andy Bunce 2025
|
||||||
:)
|
:)
|
||||||
|
@ -22,6 +22,12 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive
|
||||||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
||||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
||||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
||||||
|
declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata";
|
||||||
|
declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream";
|
||||||
|
|
||||||
|
declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
|
||||||
|
|
||||||
|
|
||||||
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
|
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
|
||||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
||||||
declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
|
declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
|
||||||
|
@ -89,7 +95,7 @@ as xs:string{
|
||||||
PDDocument:save($pdf, File:new($savepath)),$savepath
|
PDDocument:save($pdf, File:new($savepath)),$savepath
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ Create binary representation of $pdf as xs:base64Binary :)
|
(:~ Create binary representation of $pdf object as xs:base64Binary :)
|
||||||
declare function pdfbox:binary($pdf as item())
|
declare function pdfbox:binary($pdf as item())
|
||||||
as xs:base64Binary{
|
as xs:base64Binary{
|
||||||
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
||||||
|
@ -125,6 +131,7 @@ as xs:base64Binary{
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
(:~ property access map
|
(:~ property access map
|
||||||
keys are property names,
|
keys are property names,
|
||||||
values are sequences of functions to get property from $pdf object
|
values are sequences of functions to get property from $pdf object
|
||||||
|
@ -233,6 +240,37 @@ as xs:boolean{
|
||||||
=>exists()
|
=>exists()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
(:~ XMP metadata as "RDF" document; empty sequence when the document catalog has no metadata
|
||||||
|
@param $pdf PDDocument object whose document catalog is queried
@return the XMP packet parsed with parse-xml(), or () if there is no metadata
@note usually rdf:RDF root, but sometimes x:xmpmeta
@note NOTE(review): the stream is converted to text block by block in pdfbox:read-stream,
      so a multi-byte character straddling a block boundary may be decoded incorrectly - confirm
|
||||||
|
:)
|
||||||
|
declare function pdfbox:metadata($pdf as item())
|
||||||
|
as document-node(element(*))?
|
||||||
|
{
|
||||||
|
let $m:=PDDocument:getDocumentCatalog($pdf)
|
||||||
|
=>PDDocumentCatalog:getMetadata()
|
||||||
|
return if(exists($m))
|
||||||
|
then
|
||||||
|
let $is:=PDMetadata:exportXMPMetadata($m) (: stream over the XMP packet, consumed below :)
|
||||||
|
return pdfbox:do-until(
|
||||||
|
map{"n":0,"data":""}, (: initial state: nothing read yet :)
|
||||||
|

||||||
|
function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, (: append the next block to the text read so far :)
|
||||||
|

||||||
|
function($output,$pos) { $output?n eq -1 } (: stop once read reported -1 :)
|
||||||
|
)?data=>parse-xml()
|
||||||
|
else ()
|
||||||
|
};
|
||||||
|
|
||||||
|
(:~ read next block (at most 4096 bytes) from XMP stream and append it, converted to a string, to $read
@param $is stream object passed to COSInputStream:read
@param $read text accumulated by earlier calls
@return map with "n": the value returned by read (pdfbox:metadata stops on -1)
        and "data": $read followed by the text of the new block
NOTE(review): every block is converted to a string on its own; a multi-byte character
split across two blocks may not decode correctly - confirm
:)
|
||||||
|
declare %private function pdfbox:read-stream($is,$read as xs:string)
|
||||||
|
as map(*){
|
||||||
|
let $blen:=4096 (: block size in bytes :)
|
||||||
|
let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) (: buffer of $blen bytes for read to fill :)
|
||||||
|
let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) (: result of read; -1 is used by the caller as the stop signal :)
|
||||||
|
let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() (: first $n items of the buffer as text :)
|
||||||
|
return map{"n":$n, "data": $read || $data}
|
||||||
|
};
|
||||||
|
|
||||||
(:~ outline for $pdf as map()* :)
|
(:~ outline for $pdf as map()* :)
|
||||||
declare function pdfbox:outline($pdf as item())
|
declare function pdfbox:outline($pdf as item())
|
||||||
as map(*)*{
|
as map(*)*{
|
||||||
|
@ -321,7 +359,10 @@ as item()?
|
||||||
=>PDPageTree:indexOf($page)
|
=>PDPageTree:indexOf($page)
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ Return new extract PDF doc as xs:base64Binary, using a 1 based page range :)
|
(:~ Return new PDF doc containing pages $start to $end (1 based, inclusive) as xs:base64Binary
|
||||||
|
@param $start first page to include
|
||||||
|
@param $end last page to include
|
||||||
|
:)
|
||||||
declare function pdfbox:extract-range($pdf as item(),
|
declare function pdfbox:extract-range($pdf as item(),
|
||||||
$start as xs:integer,$end as xs:integer)
|
$start as xs:integer,$end as xs:integer)
|
||||||
as xs:base64Binary
|
as xs:base64Binary
|
||||||
|
@ -356,8 +397,10 @@ as xs:string{
|
||||||
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
|
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ return size of $pageNo (zero is cover :)
|
(:~ return media box of page $pageNo (zero based)
|
||||||
declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer)
|
@return e.g. [0.0,0.0,168.0,239.52]
|
||||||
|
:)
|
||||||
|
declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
|
||||||
as xs:string{
|
as xs:string{
|
||||||
PDDocument:getPage($pdf, $pageNo)
|
PDDocument:getPage($pdf, $pageNo)
|
||||||
=>PDPage:getMediaBox()
|
=>PDPage:getMediaBox()
|
||||||
|
@ -380,7 +423,7 @@ as xs:string?{
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ fn:do-until shim for BaseX 9+10
|
(:~ fn:do-until shim for BaseX 9+10
|
||||||
if fn:do-until not found use hof:until
|
if fn:do-until not found use hof:until, note: $pos always zero
|
||||||
:)
|
:)
|
||||||
declare %private function pdfbox:do-until(
|
declare %private function pdfbox:do-until(
|
||||||
$input as item()*,
|
$input as item()*,
|
||||||
|
|
Loading…
Add table
Reference in a new issue