[mod] 0.3.6
This commit is contained in:
		
							parent
							
								
									a3bef9d6f4
								
							
						
					
					
						commit
						ed84a3b342
					
				
					 5 changed files with 64 additions and 14 deletions
				
			
		| 
						 | 
				
			
			@ -1,3 +1,9 @@
 | 
			
		|||
# 0.3.6 2025-05-31
 | 
			
		||||
* Add metadata function
 | 
			
		||||
* rename page-size->page-media-box
 | 
			
		||||
# 0.3.1 2025-05-28
 | 
			
		||||
* update Apache pdfbox to 3.0.5
 | 
			
		||||
* API name changes e.g. page-count->number-of-pages
 | 
			
		||||
# 0.2.7 2025-02-18
 | 
			
		||||
* reduce memory use
 | 
			
		||||
* add open from xs:base64Binary
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										4
									
								
								doc.md
									
										
									
									
									
								
							
							
						
						
									
										4
									
								
								doc.md
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -162,10 +162,10 @@ let $labels := pdfbox:labels($pdf)
 | 
			
		|||
---
 | 
			
		||||
 | 
			
		||||
### Getting Page Size
 | 
			
		||||
To get the size of a specific page, use the `pdfbox:page-size` function.
 | 
			
		||||
To get the size of a specific page, use the `pdfbox:page-media-box` function.
 | 
			
		||||
 | 
			
		||||
```xquery
 | 
			
		||||
let $size := pdfbox:page-size($pdf, 1)  (: Get size of page 1 :)
 | 
			
		||||
let $size := pdfbox:page-media-box($pdf, 1)  (: Get size of the second page; $pageNo is zero based :)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,6 @@
 | 
			
		|||
{
 | 
			
		||||
  "name": "pdfbox",
 | 
			
		||||
  "version": "0.3.1",
 | 
			
		||||
  "version": "0.3.6",
 | 
			
		||||
  "description": "A BaseX interface to Apache Pdfbox version 3",
 | 
			
		||||
  "main": "src/Pdfbox3.xqm",
 | 
			
		||||
  "homepage": "https://github.com/expkg-zone58/pdfbox#readme",
 | 
			
		||||
| 
						 | 
				
			
			@ -9,7 +9,8 @@
 | 
			
		|||
  },
 | 
			
		||||
  "scripts": {
 | 
			
		||||
    "test": "%BASEX10%/bin/basex  -Wt tests",
 | 
			
		||||
    "docs": "xqdoca"
 | 
			
		||||
    "docs": "xqdoca",
 | 
			
		||||
    "build": "%BASEX10%/bin/basex scripts/make-xar.xq"
 | 
			
		||||
  },
 | 
			
		||||
  "keywords": [
 | 
			
		||||
    "pdf",
 | 
			
		||||
| 
						 | 
				
			
			@ -22,7 +23,7 @@
 | 
			
		|||
  "expkg_zone58": {
 | 
			
		||||
    "namespace": "org.expkg_zone58.Pdfbox3",
 | 
			
		||||
    "main-class": "org.apache.pdfbox.pdmodel.PDDocument",
 | 
			
		||||
    "manifest-jar" :"pdfbox-3.0.4.jar",
 | 
			
		||||
    "manifest-jar" :"pdfbox-3.0.5.jar",
 | 
			
		||||
    "output" :  "dist/pdfbox-3.0.5.fat.jar",
 | 
			
		||||
    "maven2": [
 | 
			
		||||
      "org.apache.pdfbox:pdfbox:3.0.5",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -20,12 +20,12 @@ The features focus on extracting information from PDFs rather than creation or e
 | 
			
		|||
* save pdf page range to a new pdf.
 | 
			
		||||
* save image of rendered pdf page.
 | 
			
		||||
* open PDF with password
 | 
			
		||||
* read XMP metadata 
 | 
			
		||||
* Page size information
 | 
			
		||||
* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage.
 | 
			
		||||
 | 
			
		||||
### Not supported:
 | 
			
		||||
* creating completely new PDFs
 | 
			
		||||
* Page size information
 | 
			
		||||
* XMP processing
 | 
			
		||||
* creating PDFs with new content
 | 
			
		||||
* Form processing
 | 
			
		||||
 | 
			
		||||
## Documentation
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3,7 +3,7 @@ xquery version '3.1';
 | 
			
		|||
A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , 
 | 
			
		||||
requires pdfbox jars on classpath, i.e. in custom or xar
 | 
			
		||||
tested with pdfbox-app-3.0.5.jar
 | 
			
		||||
@see <a href="https://pdfbox.apache.org/download.cgi">download</a>
 | 
			
		||||
@see https://pdfbox.apache.org/download.cgi
 | 
			
		||||
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
 | 
			
		||||
@author Andy Bunce 2025
 | 
			
		||||
:)
 | 
			
		||||
| 
						 | 
				
			
			@ -22,6 +22,12 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive
 | 
			
		|||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
 | 
			
		||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
 | 
			
		||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
 | 
			
		||||
declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata";
 | 
			
		||||
declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream";
 | 
			
		||||
 | 
			
		||||
declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
 | 
			
		||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
 | 
			
		||||
declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
 | 
			
		||||
| 
						 | 
				
			
			@ -89,7 +95,7 @@ as xs:string{
 | 
			
		|||
   PDDocument:save($pdf, File:new($savepath)),$savepath
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ Create binary representation of $pdf as xs:base64Binary :)
 | 
			
		||||
(:~ Create binary representation of $pdf object as xs:base64Binary :)
 | 
			
		||||
declare function pdfbox:binary($pdf as item())
 | 
			
		||||
as xs:base64Binary{
 | 
			
		||||
   let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
 | 
			
		||||
| 
						 | 
				
			
			@ -125,6 +131,7 @@ as xs:base64Binary{
 | 
			
		|||
 
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(:~ property access map
 | 
			
		||||
   keys are property names, 
 | 
			
		||||
   values are sequences of functions to get property from $pdf object
 | 
			
		||||
| 
						 | 
				
			
			@ -233,6 +240,37 @@ as xs:boolean{
 | 
			
		|||
  =>exists()
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ XMP metadata of $pdf as a parsed "RDF" document, or the empty sequence if the PDF has no metadata
 | 
			
		||||
@note usually rdf:RDF root, but sometimes x:xmpmeta 
 | 
			
		||||
:)
 | 
			
		||||
declare function pdfbox:metadata($pdf as item())
 | 
			
		||||
as document-node(element(*))?
 | 
			
		||||
{
 | 
			
		||||
  let $m:=PDDocument:getDocumentCatalog($pdf)
 | 
			
		||||
         =>PDDocumentCatalog:getMetadata()
 | 
			
		||||
  return  if(exists($m))
 | 
			
		||||
          then 
 | 
			
		||||
              let $is:=PDMetadata:exportXMPMetadata($m)
 | 
			
		||||
              return pdfbox:do-until(
 | 
			
		||||
                        map{"n":0,"data":""},
 | 
			
		||||
 | 
			
		||||
                        function($input,$pos ) {  pdfbox:read-stream($is,$input?data)},
 | 
			
		||||
 | 
			
		||||
                        function($output,$pos) { $output?n eq -1 }     
 | 
			
		||||
                     )?data=>parse-xml()
 | 
			
		||||
          else ()
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
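(:~ Read the next block of up to 4096 bytes from the XMP stream $is and append it, decoded as a string, to $read; returns map{"n": result of the read call, "data": accumulated text}, the caller stops when n is -1 :)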
 | 
			
		||||
declare %private function pdfbox:read-stream($is,$read as xs:string)
 | 
			
		||||
as map(*){
 | 
			
		||||
  let $blen:=4096
 | 
			
		||||
  let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen)
 | 
			
		||||
  let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
 | 
			
		||||
  let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
 | 
			
		||||
  return map{"n":$n, "data": $read || $data}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ outline for $pdf as map()* :)
 | 
			
		||||
declare function pdfbox:outline($pdf as item())
 | 
			
		||||
as map(*)*{
 | 
			
		||||
| 
						 | 
				
			
			@ -321,7 +359,10 @@ as item()?
 | 
			
		|||
      =>PDPageTree:indexOf($page)
 | 
			
		||||
};            
 | 
			
		||||
 | 
			
		||||
(:~  Return new extract PDF doc as xs:base64Binary, using a 1 based page range  :)
 | 
			
		||||
(:~ Return a new PDF doc containing pages $start to $end (1 based, inclusive) as xs:base64Binary
 | 
			
		||||
@param $start first page to include
 | 
			
		||||
@param $end last page to include
 | 
			
		||||
:)
 | 
			
		||||
declare function pdfbox:extract-range($pdf as item(), 
 | 
			
		||||
             $start as xs:integer,$end as xs:integer)
 | 
			
		||||
as xs:base64Binary
 | 
			
		||||
| 
						 | 
				
			
			@ -356,8 +397,10 @@ as xs:string{
 | 
			
		|||
  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ return size of $pageNo (zero is cover :)
 | 
			
		||||
declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer)
 | 
			
		||||
(:~ Return the size (media box) of page $pageNo, where $pageNo is zero based
 | 
			
		||||
@result e.g. [0.0,0.0,168.0,239.52]
 | 
			
		||||
 :)
 | 
			
		||||
declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
 | 
			
		||||
as xs:string{
 | 
			
		||||
  PDDocument:getPage($pdf, $pageNo)
 | 
			
		||||
  =>PDPage:getMediaBox()
 | 
			
		||||
| 
						 | 
				
			
			@ -380,7 +423,7 @@ as xs:string?{
 | 
			
		|||
};
 | 
			
		||||
 | 
			
		||||
(:~ fn:do-until shim for BaseX 9+10 
 | 
			
		||||
if  fn:do-until not found use hof:until
 | 
			
		||||
if fn:do-until is not found, hof:until is used instead; note: $pos is always zero
 | 
			
		||||
:)
 | 
			
		||||
declare %private function pdfbox:do-until(
 | 
			
		||||
 $input 	as item()*, 	
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue