diff --git a/.gitignore b/.gitignore index aa4ef6d..7679106 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ data/ dist/ jars/* !jars/.gitignore -docs/xqdoc/ \ No newline at end of file +docs/xqdoca/ \ No newline at end of file diff --git a/docs/xqdoc/annotations.html b/docs/xqdoc/annotations.html new file mode 100644 index 0000000..8ca7cde --- /dev/null +++ b/docs/xqdoc/annotations.html @@ -0,0 +1,11 @@ +src - xqDocA - xqDocA

+ Project + src +  Annotations +

Summary

This project uses 1 annotation namespace.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Annotations

2.1 http://www.w3.org/2012/xquery

private
\ No newline at end of file diff --git a/docs/xqdoc/imports.html b/docs/xqdoc/imports.html new file mode 100644 index 0000000..8cee469 --- /dev/null +++ b/docs/xqdoc/imports.html @@ -0,0 +1,9 @@ +src - xqDocA - xqDocA

Project src +  Imports +

Summary

Lists all modules imported.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
annotationsSummary of XQuery annotation usexhtml
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Imports (0)

\ No newline at end of file diff --git a/docs/xqdoc/index.html b/docs/xqdoc/index.html new file mode 100644 index 0000000..7f52373 --- /dev/null +++ b/docs/xqdoc/index.html @@ -0,0 +1,14 @@ +src - xqDocA - xqDocA

+ Project src +  XQuery source documentation +

Summary

The project + src contains + 1 XQuery source file, and uses + 1 annotation namespace. +

This document was built from source folder C:/Users/mrwhe/git/expkg-zone58/pdfbox/src/ on + Sunday, 1st June 2025.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
annotationsSummary of XQuery annotation usexhtml
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

XQuery Main (0)

None

XQuery Library (1)

UriPrefixDescriptionUseAMetrics
org.expkg_zone58.Pdfbox3pdfbox + +A BaseX 10.7+ interface to pdfbox 3.0 https...
0
Library
↖0
P
V#1
F#31

File view (1)

Annotation namespaces (1)

A total of 7 annotations are defined. +

http://www.w3.org/2012/xquery

private7
\ No newline at end of file diff --git a/docs/xqdoc/mermaid.html b/docs/xqdoc/mermaid.html new file mode 100644 index 0000000..115ad7f --- /dev/null +++ b/docs/xqdoc/mermaid.html @@ -0,0 +1,35 @@ +Module imports diagram - xqDocA
--- +title: something here +config: + theme: base + securityLevel: loose +--- +classDiagram +direction TB + +class RESTXQ:::cssrest +class INVOKE:::cssmain +class TEST:::cssunit + +class pdfbox { << Pdfbox3.xqm >>} + + + +classDef cssrest fill:palegreen +classDef cssmain fill:powderblue +classDef cssunit fill:yellow + +link pdfbox "modules/F000001/index.html" "This is a tooltip for org.expkg_zone58.Pdfbox3" + + +
\ No newline at end of file diff --git a/docs/xqdoc/mermaid.mmd b/docs/xqdoc/mermaid.mmd new file mode 100644 index 0000000..05c5513 --- /dev/null +++ b/docs/xqdoc/mermaid.mmd @@ -0,0 +1,24 @@ +--- +title: something here +config: + theme: base + securityLevel: loose +--- +classDiagram +direction TB + +class RESTXQ:::cssrest +class INVOKE:::cssmain +class TEST:::cssunit + +class pdfbox { << Pdfbox3.xqm >>} + + + +classDef cssrest fill:palegreen +classDef cssmain fill:powderblue +classDef cssunit fill:yellow + +link pdfbox "modules/F000001/index.html" "This is a tooltip for org.expkg_zone58.Pdfbox3" + + diff --git a/docs/xqdoc/modules/F000001/index.html b/docs/xqdoc/modules/F000001/index.html new file mode 100644 index 0000000..7331d1c --- /dev/null +++ b/docs/xqdoc/modules/F000001/index.html @@ -0,0 +1,842 @@ +src - xqDocA - xqDocA

org.expkg_zone58.Pdfbox3  + library module
P

Summary

+ +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, i.e. in custom or xar +tested with pdfbox-app-3.0.5.jar +
See also
Authors
  • Andy Bunce 2025
Custom
Related documents
ViewDescriptionFormat
xqdocxqDoc xml file from the source modulexml
xqparsexqparse xml file from the source modulexml

Imports

+ This module is imported by + 0 modules. It imports + 0 modules. +

Variables

3.1 $pdfbox:property-map

Summary
+property access map +keys are property names, +values are sequences of functions to get property from $pdf object +
Type
References 14 functions from 3 modules
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getAuthor#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreationDate#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreator#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getKeywords#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getModificationDate#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getProducer#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getSubject#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getTitle#1
  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentInformation#1
  • pdfbox:gregToISO#1
  • pdfbox:hasLabels#1
  • pdfbox:hasOutline#1
  • pdfbox:number-of-pages#1
  • pdfbox:specification#1
Annotations (1)
%private()
Source ( 35 lines)
variable $pdfbox:property-map:=map{
+  "pageCount": pdfbox:number-of-pages#1,
+
+  "hasOutline": pdfbox:hasOutline#1,
+
+  "hasLabels": pdfbox:hasLabels#1,
+
+  "specification":pdfbox:specification#1,
+
+  "title": (PDDocument:getDocumentInformation#1,
+            PDDocumentInformation:getTitle#1) ,
+
+  "author": (PDDocument:getDocumentInformation#1,
+             PDDocumentInformation:getAuthor#1 ),
+
+  "creator": (PDDocument:getDocumentInformation#1,
+              PDDocumentInformation:getCreator#1),
+
+  "producer": (PDDocument:getDocumentInformation#1,
+               PDDocumentInformation:getProducer#1),
+
+  "subject": (PDDocument:getDocumentInformation#1,
+              PDDocumentInformation:getSubject#1),
+
+  "keywords": (PDDocument:getDocumentInformation#1,
+               PDDocumentInformation:getKeywords#1),
+
+  "creationDate": (PDDocument:getDocumentInformation#1,
+                   PDDocumentInformation:getCreationDate#1,
+                   pdfbox:gregToISO#1),
+
+  "modificationDate":  (PDDocument:getDocumentInformation#1,
+                        PDDocumentInformation:getModificationDate#1,
+                        pdfbox:gregToISO#1)
+}

Functions

4.1 pdfbox:binary

Arities: #1

Summary
+Create binary representation of $pdf object as xs:base64Binary
Signatures
pdfbox:binary + ( + $pdf as item() ) as xs:base64Binary
Parameters
  • pdf as item()
Return
  • xs:base64Binary
Referenced by 1 functions from 1 modules
References 3 functions from 2 modules
  • {java:java.io.ByteArrayOutputStream}new#0
  • {java:java.io.ByteArrayOutputStream}toByteArray#1
  • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
Source ( 7 lines)
function pdfbox:binary($pdf as item())
+as xs:base64Binary{
+   let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
+   let $_:=PDDocument:save($pdf, $bytes)
+   return  Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
+         =>convert:integers-to-base64()
+}

4.2 pdfbox:bookmark

Arities: #2P

Summary
+return bookmark info for $bookmark +
Signatures
pdfbox:bookmark + ( + $bookmark as item(), $pdf as item() ) as map(*)
Parameters
  • bookmark as item()
  • pdf as item()
Return
  • map(*) map{index:..,title:..,hasChildren:..}
Referenced by 1 functions from 1 modules
References 3 functions from 1 modules
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}findDestinationPage#2
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getTitle#1
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}hasChildren#1
Annotations (1)
%private()
Source ( 10 lines)
function pdfbox:bookmark($bookmark as item(),$pdf as item())
+as map(*)
+{
+ map{ 
+  "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
+  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
+  (:=>translate("�",""), :),
+  "hasChildren": PDOutlineItem:hasChildren($bookmark)
+  }
+}

4.3 pdfbox:bookmark-xml

Arities: #1P

Summary
+recursive outline map to XML
Signatures
pdfbox:bookmark-xml + ( + $outline as map(*)* ) as element(bookmark)*
Parameters
  • outline as map(*)*
Return
  • element(bookmark) *
Referenced by 2 functions from 1 modules
References 1 functions from 1 modules
Annotations (1)
%private()
Source ( 8 lines)
function pdfbox:bookmark-xml($outline as map(*)*)
+as element(bookmark)*
+{
+  $outline!
+  <bookmark title="{?title}" index="{?index}">
+    {?children!pdfbox:bookmark-xml(.)}
+  </bookmark>
+}

4.4 pdfbox:close

Arities: #1

Summary
+Release any resources related to $pdf
Signatures
pdfbox:close + ( + $pdf as item() ) as empty-sequence
Parameters
  • pdf as item()
Return
  • empty-sequence
Referenced by 3 functions from 1 modules
References 1 functions from 1 modules
  • {java:org.apache.pdfbox.pdmodel.PDDocument}close#1
Source ( 6 lines)
function pdfbox:close($pdf as item())
+as empty-sequence(){
+  (# db:wrapjava void #) {
+     PDDocument:close($pdf)
+  }
+}

4.5 pdfbox:do-until

Arities: #3P

Summary
+fn:do-until shim for BaseX 9 and 10: +if fn:do-until is not found, hof:until is used instead; note that $pos is then always zero +
Signatures
pdfbox:do-until + ( + $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()*
Parameters
  • input as item()*
  • action as function(item()*, xs:integer) as item()*
  • predicate as function(item()*, xs:integer) as xs:boolean?
Return
  • item() *
Referenced by 2 functions from 1 modules
References 5 functions from 2 modules
  • {http://www.w3.org/2001/XMLSchema}QName#1
  • {http://www.w3.org/2005/xpath-functions}QName#2
  • {http://www.w3.org/2005/xpath-functions}error#2
  • {http://www.w3.org/2005/xpath-functions}exists#1
  • {http://www.w3.org/2005/xpath-functions}function-lookup#2
Annotations (1)
%private()
Source ( 15 lines)
function pdfbox:do-until(
+ $input 	as item()*, 	
+ $action 	as function(item()*, xs:integer) as item()*, 	
+ $predicate 	as function(item()*, xs:integer) as xs:boolean? 	
+) as item()*
+{
+  let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
+  return if(exists($fn))
+         then $fn($input,$action,$predicate)
+         else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3)
+              return if(exists($hof))
+                      then $hof($predicate(?,0),$action(?,0),$input)
+                      else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
+
+}

4.6 pdfbox:extract-range

Arities: #3

Summary
+Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +
Signatures
pdfbox:extract-range + ( + $pdf as item(), $start as xs:integer, $end as xs:integer ) as xs:base64Binary
Parameters
  • pdf as item()
  • start as xs:integer first page to include
  • end as xs:integer last page to include
Return
  • xs:base64Binary
Referenced by 0 functions from 0 modules
    References 3 functions from 2 modules
    Source ( 7 lines)
    function pdfbox:extract-range($pdf as item(), 
    +             $start as xs:integer,$end as xs:integer)
    +as xs:base64Binary
    +{
    +    let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
    +    return (pdfbox:binary($a),pdfbox:close($a)) 
    +}

    4.7 pdfbox:find-page

    Arities: #2

    Summary
    +pageIndex of $page in $pdf
    Signatures
    pdfbox:find-page + ( + $page as item()?, $pdf as item() ) as item()?
    Parameters
    • page as item()?
    • pdf as item()
    Return
    • item() ?
    Referenced by 0 functions from 0 modules
      References 2 functions from 2 modules
      • {http://www.w3.org/2005/xpath-functions}exists#1
      • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
      Source ( 10 lines)
      function pdfbox:find-page(
      +   $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
      +   $pdf as item())
      +as item()?
      +{
      +  if(exists($page))
      +  then PDDocument:getDocumentCatalog($pdf)
      +      =>PDDocumentCatalog:getPages()
      +      =>PDPageTree:indexOf($page)
      +}

      4.8 pdfbox:gregToISO

      Arities: #1P

      Summary
      +convert a java.util.GregorianCalendar date to its zoned date-time string, or empty if no date is supplied
      Signatures
      pdfbox:gregToISO + ( + $item as item()? ) as xs:string?
      Parameters
      • item as item()?
      Return
      • xs:string ?
      Referenced by 0 functions from 0 modules
        References 2 functions from 2 modules
        • {http://www.w3.org/2005/xpath-functions}exists#1
        • {java:java.util.GregorianCalendar}toZonedDateTime#1
        Annotations (1)
        %private()
        Source ( 6 lines)
        function pdfbox:gregToISO($item as item()?)
        +as xs:string?{
        + if(exists($item))
        + then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
        + else ()
        +}

        4.9 pdfbox:hasLabels

        Arities: #1

        Summary
        +true if $pdf has Labels
        Signatures
        pdfbox:hasLabels + ( + $pdf as item() ) as xs:boolean
        Parameters
        • pdf as item()
        Return
        • xs:boolean
        Referenced by 0 functions from 0 modules
          References 1 functions from 1 modules
          • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
          Source ( 6 lines)
          function pdfbox:hasLabels($pdf as item())
          +as xs:boolean{
          +  PDDocument:getDocumentCatalog($pdf)
          +  =>PDDocumentCatalog:getPageLabels()
          +  =>exists()
          +}

          4.10 pdfbox:hasOutline

          Arities: #1

          Summary
          +true if $pdf has an outline
          Signatures
          pdfbox:hasOutline + ( + $pdf as item() ) as xs:boolean
          Parameters
          • pdf as item()
          Return
          • xs:boolean
          Referenced by 0 functions from 0 modules
            References 1 functions from 1 modules
            • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
            Source ( 6 lines)
            function pdfbox:hasOutline($pdf as item())
            +as xs:boolean{
            +  PDDocument:getDocumentCatalog($pdf)
            +  =>PDDocumentCatalog:getDocumentOutline()
            +  =>exists()
            +}

            4.11 pdfbox:labels

            Arities: #1

            Summary
            +pageLabel for every page or empty if none +
            Signatures
            pdfbox:labels + ( + $pdf as item() ) as xs:string*
            Parameters
            • pdf as item()
            Return
            • xs:string *
            Tags
            Referenced by 0 functions from 0 modules
              References 3 functions from 3 modules
              • {http://www.w3.org/2005/xpath-functions}exists#1
              • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
              • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getLabelsByPageIndices#1
              Source ( 9 lines)
              function pdfbox:labels($pdf as item())
              +as xs:string*
              +{
              +  let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
              +                   =>PDDocumentCatalog:getPageLabels()
              +  return if(exists($pagelabels))
              +         then PDPageLabels:getLabelsByPageIndices($pagelabels)
              +         else ()
              +}

              4.12 pdfbox:metadata

              Arities: #1

              Summary
              +XMP metadata as "RDF" document +
              Signatures
              pdfbox:metadata + ( + $pdf as item() ) as document-node(element(*))?
              Parameters
              • pdf as item()
              Return
              • document-node(element(*)) ?
              Tags
              • @note: + usually rdf:RDF root, but sometimes x:xmpmeta
              Referenced by 0 functions from 0 modules
                References 5 functions from 4 modules
                • {http://www.w3.org/2005/xpath-functions}exists#1
                • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                • {java:org.apache.pdfbox.pdmodel.common.PDMetadata}exportXMPMetadata#1
                • pdfbox:do-until#3
                • pdfbox:read-stream#2
                Source ( 17 lines)
                function pdfbox:metadata($pdf as item())
                +as document-node(element(*))?
                +{
                +  let $m:=PDDocument:getDocumentCatalog($pdf)
                +         =>PDDocumentCatalog:getMetadata()
                +  return  if(exists($m))
                +          then 
                +              let $is:=PDMetadata:exportXMPMetadata($m)
                +              return pdfbox:do-until(
                +                        map{"n":0,"data":""},
                +
                +                        function($input,$pos ) {  pdfbox:read-stream($is,$input?data)},
                +
                +                        function($output,$pos) { $output?n eq -1 }     
                +                     )?data=>parse-xml()
                +          else ()
                +}

                4.13 pdfbox:number-of-pages

                Arities: #1

                Summary
                +Number of pages in PDF
                Signatures
                pdfbox:number-of-pages + ( + $pdf as item() ) as xs:integer
                Parameters
                • pdf as item()
                Return
                • xs:integer
                Referenced by 0 functions from 0 modules
                  References 1 functions from 1 modules
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getNumberOfPages#1
                  Source ( 4 lines)
                  function pdfbox:number-of-pages($pdf as item())
                  +as xs:integer{
                  +  PDDocument:getNumberOfPages($pdf)
                  +}

                  4.14 pdfbox:open

                  Arities: #1#2

                  Summary
                  +open pdf using fetch:binary, returns pdf object
                  Signatures
                  pdfbox:open + ( + $pdfsrc as item() ) as item()
                  pdfbox:open + ( + $pdfsrc as item(), $opts as map(*) ) as item()
                  Parameters
                  • pdfsrc as item() a fetchable url or filepath, or xs:base64Binary item
                  • opts as map(*) options, optionally with map {"password":}
                  Return
                  • item()
                  Referenced by 3 functions from 1 modules
                  References 8 functions from 6 modules
                  • {http://basex.org/modules/fetch}binary#1
                  • {http://www.w3.org/2001/XMLSchema}QName#1
                  • {http://www.w3.org/2005/xpath-functions}error#2
                  • {http://www.w3.org/2005/xpath-functions}starts-with#2
                  • {http://www.w3.org/2005/xpath-functions}string#1
                  • {java:org.apache.pdfbox.Loader}loadPDF#2
                  • {java:org.apache.pdfbox.io.RandomAccessReadBufferedFile}new#1
                  • pdfbox:open#2
                  Source ( 21 lines)
                  function pdfbox:open($pdfsrc as item())
                  +as item(){
                  +pdfbox:open($pdfsrc, map{})
                  +}
                  function pdfbox:open($pdfsrc as item(), $opts as map(*))
                  +as item(){
                  +  try{
                  +
                  +      if($pdfsrc instance of xs:base64Binary)
                  +      then Loader:loadPDF( $pdfsrc,string($opts?password))
                  +      else if(starts-with($pdfsrc,"http"))
                  +           then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password))
                  +           else  Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password))
                  +
                  +} catch *{
                  +    let $loc:=if($pdfsrc instance of xs:base64Binary)
                  +              then "xs:base64Binary"
                  +              else $pdfsrc
                  +    return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
                  +}
                  +}

                  4.15 pdfbox:outline

                  Arities: #1#2

                  Summary
                  +outline for $pdf as map()*
                  Signatures
                  pdfbox:outline + ( + $pdf as item() ) as map(*)*
                  pdfbox:outline + ( + $pdf as item(), $outlineItem as item()? ) as map(*)*
                  Parameters
                  • pdf as item()
                  • outlineItem as item()?
                  Return
                  • map(*) *
                  Referenced by 3 functions from 1 modules
                  References 6 functions from 5 modules
                  • {http://www.w3.org/2005/xpath-functions/map}get#2
                  • {http://www.w3.org/2005/xpath-functions}exists#1
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                  • pdfbox:outline#2
                  • pdfbox:outline_#2
                  Source ( 16 lines)
                  function pdfbox:outline($pdf as item())
                  +as map(*)*{
                  +  (# db:wrapjava some #) {
                  +  let $outline:=
                  +                PDDocument:getDocumentCatalog($pdf)
                  +                =>PDDocumentCatalog:getDocumentOutline()
                  + 
                  +  return  if(exists($outline))
                  +          then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) 
                  +  }
                  +}
                  function pdfbox:outline($pdf as item(),$outlineItem as item()?)
                  +as map(*)*{
                  +  let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
                  +  return map:get($find,"list")
                  +}

                  4.16 pdfbox:outline-xml

                  Arities: #1

                  Summary
                  +PDF outline in xml format
                  Signatures
                  pdfbox:outline-xml + ( + $pdf as item() ) as element(outline)?
                  Parameters
                  • pdf as item()
                  Return
                  • element(outline) ?
                  Referenced by 0 functions from 0 modules
                    References 3 functions from 2 modules
                    Source ( 7 lines)
                    function pdfbox:outline-xml($pdf as item())
                    +as element(outline)?{
                    + let $outline:=pdfbox:outline($pdf)
                    +  return if(exists($outline))
                    +         then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
                    +         else ()
                    +}

                    4.17 pdfbox:outline_

                    Arities: #2P

                    Summary
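                    +Helper for pdfbox:outline, kept as a separate function because of a suspected BaseX 10.7 bug: an error occurs if this code is inlined in outline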
                    Signatures
                    pdfbox:outline_ + ( + $pdf as item(), $outlineItem as item()? ) as map(*)
                    Parameters
                    • pdf as item()
                    • outlineItem as item()?
                    Return
                    • map(*)
                    Referenced by 1 functions from 1 modules
                    References 8 functions from 4 modules
                    • {http://www.w3.org/2005/xpath-functions/map}entry#2
                    • {http://www.w3.org/2005/xpath-functions/map}merge#1
                    • {http://www.w3.org/2005/xpath-functions}empty#1
                    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getNextSibling#1
                    • pdfbox:bookmark#2
                    • pdfbox:do-until#3
                    • pdfbox:outline#2
                    Annotations (1)
                    %private()
                    Source ( 20 lines)
                    function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                    +as map(*){
                    +  pdfbox:do-until(
                    +    
                    +     map{"list":(),"this":$outlineItem},
                    +
                    +     function($input,$pos ) { 
                    +        let $bk:= pdfbox:bookmark($input?this,$pdf)
                    +        let $bk:= if($bk?hasChildren)
                    +                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                    +                        return map:merge(($bk,map:entry("children",$kids)))
                    +                  else $bk 
                    +        return map{
                    +              "list": ($input?list, $bk),
                    +              "this":  PDOutlineItem:getNextSibling($input?this)}
                    +      },
                    +
                    +     function($output,$pos) { empty($output?this) }                      
                    +  )
                    +}

                    4.18 pdfbox:page-media-box

                    Arities: #2

                    Summary
                    +return size of $pageNo (zero based) +
                    Signatures
                    pdfbox:page-media-box + ( + $pdf as item(), $pageNo as xs:integer ) as xs:string
                    Parameters
                    • pdf as item()
                    • pageNo as xs:integer
                    Return
                    • xs:string
                    Tags
                    • @result: + e.g. [0.0,0.0,168.0,239.52]
                    Referenced by 0 functions from 0 modules
                      References 1 functions from 1 modules
                      • {java:org.apache.pdfbox.pdmodel.PDDocument}getPage#2
                      Source ( 6 lines)
                      function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
                      +as xs:string{
                      +  PDDocument:getPage($pdf, $pageNo)
                      +  =>PDPage:getMediaBox()
                      +  =>PDRectangle:toString()
                      +}

                      4.19 pdfbox:page-render

                      Arities: #3

                      Summary
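                        +PDF page as image ($pageNo is zero based, so zero is the cover) +options.format is an image format name such as "bmp", "jpg", "png" or "gif" (default "jpg"); options.scale is the scaling factor, where 1 corresponds to 72 dpi (default 1)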
                      Signatures
                      pdfbox:page-render + ( + $pdf as item(), $pageNo as xs:integer, $options as map(*) ) as xs:base64Binary
                      Parameters
                      • pdf as item()
                      • pageNo as xs:integer
                      • options as map(*)
                      Return
                      • xs:base64Binary
                      Referenced by 0 functions from 0 modules
                        References 5 functions from 4 modules
                        • {http://www.w3.org/2005/xpath-functions/map}merge#1
                        • {java:java.io.ByteArrayOutputStream}new#0
                        • {java:java.io.ByteArrayOutputStream}toByteArray#1
                        • {java:javax.imageio.ImageIO}write#3
                        • {java:org.apache.pdfbox.rendering.PDFRenderer}new#1
                        Source ( 10 lines)
                        function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                        +as xs:base64Binary{
                        +  let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
                        +  let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
                        +  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                        +  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                        +  return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                        +         =>convert:integers-to-base64()
                        + 
                        +}

                        4.20 pdfbox:page-text

                        Arities: #2

                        Summary
                        +return text on $pageNo
                        Signatures
                        pdfbox:page-text + ( + $pdf as item(), $pageNo as xs:integer ) as xs:string
                        Parameters
                        • pdf as item()
                        • pageNo as xs:integer
                        Return
                        • xs:string
                        Referenced by 0 functions from 0 modules
                          References 2 functions from 1 modules
                          • {java:org.apache.pdfbox.text.PDFTextStripper}getText#2
                          • {java:org.apache.pdfbox.text.PDFTextStripper}new#0
                          Source ( 9 lines)
                          function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                          +as xs:string{
                          +  let $tStripper := (# db:wrapjava instance #) {
                          +         PDFTextStripper:new()
                          +         => PDFTextStripper:setStartPage($pageNo)
                          +         => PDFTextStripper:setEndPage($pageNo)
                          +       }
                          +  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                          +}

                          4.21 pdfbox:property

                          Arities: #2

                          Summary
                          +return value of $property for $pdf
                          Signatures
                          pdfbox:property + ( + $pdf as item(), $property as xs:string ) as item()*
                          Parameters
                          • pdf as item()
                          • property as xs:string
                          Return
                          • item() *
                          Referenced by 1 functions from 1 modules
                          References 5 functions from 2 modules
                          • {http://www.w3.org/2001/XMLSchema}QName#1
                          • {http://www.w3.org/2005/xpath-functions}concat#3
                          • {http://www.w3.org/2005/xpath-functions}error#2
                          • {http://www.w3.org/2005/xpath-functions}exists#1
                          • {http://www.w3.org/2005/xpath-functions}fold-left#3
                          Source ( 9 lines)
                          function pdfbox:property($pdf as item(),$property as xs:string)
                          +as item()*{
                          +  (: accessor chain registered for $property; empty when the name is unknown :)
                          +  let $chain := $pdfbox:property-map?($property)
                          +  (: feed the running result into the next accessor :)
                          +  let $apply := function($acc,$step as function(*)){$step($acc)}
                          +  return if(exists($chain)) then fold-left($chain, $pdf, $apply)
                          +         else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                          +}

                          4.22 pdfbox:property-names

                          Arities: #0

                          Summary
                          +known property names sorted
                          Signatures
                          pdfbox:property-names + ( + ) as xs:string*
                          Return
                          • xs:string *
                          Referenced by 0 functions from 0 modules
                            Source ( 4 lines)
                            function pdfbox:property-names() 
                            +as xs:string*{
                            +  sort(map:keys($pdfbox:property-map)) (: keys of the property map, sorted :)
                            +}

                            4.23 pdfbox:read-stream

                            Arities: #2P

                            Summary
                            +read next block from XMP stream
                            Signatures
                            pdfbox:read-stream + ( + $is, $read as xs:string ) as map(*)
                            Parameters
                            • is as item()* (no type declared in the source)
                            • read as xs:string
                            Return
                            • map(*)
                            Referenced by 1 functions from 1 modules
                            References 6 functions from 5 modules
                            • {http://basex.org/modules/convert}integers-to-base64#1
                            • {http://www.w3.org/2001/XMLSchema}byte#1
                            • {http://www.w3.org/2001/XMLSchema}int#1
                            • {http://www.w3.org/2005/xpath-functions}subsequence#3
                            • {java:java.util.Arrays}copyOf#2
                            • {java:org.apache.pdfbox.cos.COSInputStream}read#4
                            Annotations (1)
                            %private()
                            Source ( 8 lines)
                            function pdfbox:read-stream($is,$read as xs:string)
                            +as map(*){ (: reads one block from $is and appends its text to $read :)
                            +  let $blen:=4096 (: number of bytes requested per read :)
                            +  let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) (: buffer of $blen bytes, made by extending a one-byte array :)
                            +  let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) (: value returned by the read call; NOTE(review): presumably the byte count, or -1 at end of stream - confirm :)
                            +  let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() (: only the first $n buffer entries are decoded; NOTE(review): each block is decoded on its own, so a multi-byte character split across two blocks would not decode correctly :)
                            +  return map{"n":$n, "data": $read || $data}
                            +}

                            4.24 pdfbox:report

                            Arities: #1#2

                            Summary
                            +summary CSV style info for all properties for $pdfpaths +
                            Signatures
                            pdfbox:report + ( + $pdfpaths as xs:string* ) as map(*)
                            pdfbox:report + ( + $pdfpaths as item()*, $properties as xs:string* ) as map(*)
                            Parameters
                            • pdfpaths as item()*
                            • properties as xs:string*
                            Return
                            • map(*)
                            Tags
                            Referenced by 1 functions from 1 modules
                            References 8 functions from 4 modules
                            Source ( 28 lines)
                            function pdfbox:report($pdfpaths as xs:string*)
                            +as map(*){ (: report on every property in the property map :)
                            + pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
                            +}
                            function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                            +as map(*){
                            +  map{"names":   array{"path",$properties},
                            +      "records": for $path in $pdfpaths
                            +                 let $name:=if($path instance of xs:base64Binary) then "binary" else $path
                            +                 return try{
                            +                  let $pdf:=pdfbox:open($path)
                            +                  return try{
                            +                           fold-left($properties, array{$name},
                            +                                     function($result as array(*),$prop as xs:string){
                            +                                       array:append($result, string(pdfbox:property($pdf, $prop)))}
                            +                           ), pdfbox:close($pdf)
                            +                         } catch *{ (: close the pdf, then let the outer catch build the #ERROR row :)
                            +                           pdfbox:close($pdf), error($err:code,string($err:description),$err:value)
                            +                         }
                            +                 } catch *{
                            +                      fold-left($properties,
                            +                                array{$name},
                            +                                function($result as array(*),$prop as xs:string){
                            +                                    array:append($result, "#ERROR")}
                            +                               )
                            +                 }
                            +  }
                            +}

                            4.25 pdfbox:save

                            Arities: #2

                            Summary
                            +Save pdf $pdf to filesystem at $savepath , returns $savepath
                            Signatures
                            pdfbox:save + ( + $pdf as item(), $savepath as xs:string ) as xs:string
                            Parameters
                            • pdf as item()
                            • savepath as xs:string
                            Return
                            • xs:string
                            Referenced by 0 functions from 0 modules
                              References 2 functions from 2 modules
                              • {java:java.io.File}new#1
                              • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
                              Source ( 4 lines)
                              function pdfbox:save($pdf as item(),$savepath as xs:string)
                              +as xs:string{ (: writes $pdf to the file at $savepath and hands the path back :)
                              +   let $target := File:new($savepath) return (PDDocument:save($pdf, $target), $savepath)
                              +}

                              4.26 pdfbox:specification

                              Arities: #1

                              Summary
                              +The version of the PDF specification used by $pdf e.g "1.4" +returned as string to avoid float rounding issues +
                              Signatures
                              pdfbox:specification + ( + $pdf as item() ) as xs:string
                              Parameters
                              • pdf as item()
                              Return
                              • xs:string
                              Referenced by 0 functions from 0 modules
                                References 1 functions from 1 modules
                                • {java:org.apache.pdfbox.pdmodel.PDDocument}getVersion#1
                                Source ( 4 lines)
                                function pdfbox:specification($pdf as item())
                                +as xs:string{ (: version as decimal, rounded to 4 places, as string :)
                                + string(round(xs:decimal(PDDocument:getVersion($pdf)), 4))
                                +}

                                4.27 pdfbox:version

                                Arities: #0

                                Summary
                                +version of Apache Pdfbox in use e.g. "3.0.4"
                                Signatures
                                pdfbox:version + ( + ) as xs:string
                                Return
                                • xs:string
                                Referenced by 0 functions from 0 modules
                                  References 1 functions from 1 modules
                                  • {java:org.apache.pdfbox.util.Version}getVersion#0
                                  Source ( 4 lines)
                                  function pdfbox:version()
                                  +as xs:string{ (: result of the static getVersion() call on org.apache.pdfbox.util.Version :)
                                  +  Q{java:org.apache.pdfbox.util.Version}getVersion()
                                  +}

                                  4.28 pdfbox:with-pdf

                                  Arities: #2

                                  Summary
                                  +with-document pattern: open pdf,apply function, close pdf +creates a local pdfobject and ensures it is closed after use +e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) +
                                  Signatures
                                  pdfbox:with-pdf + ( + $src as xs:string, $fn as function(item())as item()* ) as item()*
                                  Parameters
                                  • src as xs:string
                                  • fn as function(item())as item()*
                                  Return
                                  • item() *
                                  Referenced by 0 functions from 0 modules
                                    References 3 functions from 2 modules
                                    Source ( 11 lines)
                                    function pdfbox:with-pdf($src as xs:string,
                                    +                                $fn as function(item())as item()*)
                                    +as item()*{
                                    + let $pdf:=pdfbox:open($src)
                                    + return try{
                                    +        $fn($pdf),pdfbox:close($pdf)
                                    +        } catch *{ (: a failing close must not mask the original error :)
                                    +          try{ pdfbox:close($pdf) } catch *{ () },
                                    +          fn:error($err:code,$src || " " || $err:description,$err:value)
                                    +        }
                                    +}

                                    Namespaces

                                    The following namespaces are defined:

                                    Prefix | Uri
                                    array | http://www.w3.org/2005/xpath-functions/array
                                    convert | http://basex.org/modules/convert
                                    COSInputStream | java:org.apache.pdfbox.cos.COSInputStream
                                    db | http://basex.org/modules/db
                                    err | http://www.w3.org/2005/xqt-errors
                                    fetch | http://basex.org/modules/fetch
                                    File | java:java.io.File
                                    fn | http://www.w3.org/2005/xpath-functions
                                    Loader | java:org.apache.pdfbox.Loader
                                    map | http://www.w3.org/2005/xpath-functions/map
                                    PageExtractor | java:org.apache.pdfbox.multipdf.PageExtractor
                                    PDDocument | java:org.apache.pdfbox.pdmodel.PDDocument
                                    PDDocumentCatalog | java:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                                    PDDocumentInformation | java:org.apache.pdfbox.pdmodel.PDDocumentInformation
                                    PDDocumentOutline | java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                                    pdfbox | org.expkg_zone58.Pdfbox3
                                    PDFRenderer | java:org.apache.pdfbox.rendering.PDFRenderer
                                    PDFTextStripper | java:org.apache.pdfbox.text.PDFTextStripper
                                    PDMetadata | java:org.apache.pdfbox.pdmodel.common.PDMetadata
                                    PDOutlineItem | java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                                    PDPage | java:org.apache.pdfbox.pdmodel.PDPage
                                    PDPageLabels | java:org.apache.pdfbox.pdmodel.common.PDPageLabels
                                    PDPageTree | java:org.apache.pdfbox.pdmodel.PDPageTree
                                    PDRectangle | org.apache.pdfbox.pdmodel.common.PDRectangle
                                    RandomAccessReadBuffer | java:org.apache.pdfbox.io.RandomAccessReadBuffer
                                    RandomAccessReadBufferedFile | java:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                                    rdf | http://www.w3.org/1999/02/22-rdf-syntax-ns#
                                    xs | http://www.w3.org/2001/XMLSchema

                                    6 RestXQ

                                    None

                                    Source Code

                                    xquery version '3.1';
                                    +(:~ 
                                    +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , 
                                    +requires pdfbox jars on classpath, i.e. in custom or xar
                                    +tested with pdfbox-app-3.0.5.jar
                                    +@see https://pdfbox.apache.org/download.cgi
                                    +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
                                    +@author Andy Bunce 2025
                                    +:)
                                    +
                                    +module namespace pdfbox="org.expkg_zone58.Pdfbox3";
                                    +
                                    +declare namespace Loader ="java:org.apache.pdfbox.Loader"; 
                                    +declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
                                    +declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
                                    +declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
                                    +declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
                                    +declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
                                    +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage";
                                    +declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
                                    +declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
                                    +declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
                                    +declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
                                    +declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
                                    +declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata";
                                    +declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream";
                                    +
                                    +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
                                    +
                                    +
                                    +declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
                                    +declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
                                    +declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
                                    +
                                    +declare namespace File ="java:java.io.File";
                                    +
                                    +
                                    +
                                    +(:~ with-document pattern: open pdf, apply function, close pdf.
                                    + Creates a local pdf object and ensures it is closed after use,
                                    + whether or not $fn succeeds.
                                    + e.g. pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
                                    + @param $src source of the pdf, passed to pdfbox:open
                                    + @param $fn function applied to the opened pdf object
                                    + @return the result of $fn
                                    + @error any error raised while applying $fn is re-raised with the same code
                                    +        and value; its description is prefixed with $src
                                    +:)
                                    +declare function pdfbox:with-pdf($src as xs:string,
                                    +                                $fn as function(item())as item()*)
                                    +as item()*{
                                    + let $pdf:=pdfbox:open($src)
                                    + return try{
                                    +        $fn($pdf),pdfbox:close($pdf)
                                    +        } catch *{
                                    +          (: a failing close must not mask the original error :)
                                    +          try{ pdfbox:close($pdf) } catch *{ () },
                                    +          (: keep the error value so callers can still inspect it :)
                                    +          fn:error($err:code,$src || " " || $err:description,$err:value)
                                    +        }
                                    +
                                    +};
                                    +
                                    +
                                    +(:~ open pdf from file/url/binary without options, returns pdf object.
                                    + Same as pdfbox:open($pdfsrc, map{}).
                                    + @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item
                                    +:)
                                    +declare function pdfbox:open($pdfsrc as item())
                                    +as item(){
                                    +  $pdfsrc => pdfbox:open(map{})
                                    +};
                                    +
                                    +(:~ open pdf from file/url/binary, returns pdf object.
                                    + An xs:base64Binary item is loaded directly, a source starting with "http"
                                    + is fetched with fetch:binary, anything else is opened as a file path.
                                    +@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item
                                    +@param $opts options, optionally with map {"password":}
                                    +@error pdfbox:open if the pdf could not be loaded
                                    +:)
                                    +declare function pdfbox:open($pdfsrc as item(), $opts as map(*))
                                    +as item(){
                                    +  try{
                                    +    let $password := string($opts?password)
                                    +    return typeswitch($pdfsrc)
                                    +           case xs:base64Binary
                                    +             return Loader:loadPDF($pdfsrc, $password)
                                    +           default
                                    +             return if(starts-with($pdfsrc,"http"))
                                    +                    then Loader:loadPDF(fetch:binary($pdfsrc), $password)
                                    +                    else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc), $password)
                                    +  } catch *{
                                    +    (: binary content is not echoed in the message, only its type :)
                                    +    let $loc := typeswitch($pdfsrc)
                                    +                case xs:base64Binary return "xs:base64Binary"
                                    +                default return $pdfsrc
                                    +    return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
                                    +  }
                                    +};
                                    +
                                    +(:~ PDF specification version used by $pdf, e.g. "1.4".
                                    + The value is converted to xs:decimal, rounded to 4 places and
                                    + returned as a string to avoid float rounding issues.
                                    + :)
                                    +declare function pdfbox:specification($pdf as item())
                                    +as xs:string{
                                    + string(
                                    +   round(xs:decimal(PDDocument:getVersion($pdf)), 4)
                                    + )
                                    +};
                                    +
                                    +(:~ Write $pdf to the filesystem at $savepath.
                                    + @param $pdf pdf object
                                    + @param $savepath target file path
                                    + @return $savepath
                                    + :)
                                    +declare function pdfbox:save($pdf as item(),$savepath as xs:string)
                                    +as xs:string{
                                    +   let $target := File:new($savepath)
                                    +   return (PDDocument:save($pdf, $target), $savepath)
                                    +};
                                    +
                                    +(:~ Serialize the $pdf object into memory and return it as xs:base64Binary.
                                    + The pdf is saved to a ByteArrayOutputStream whose bytes are then converted.
                                    + :)
                                    +declare function pdfbox:binary($pdf as item())
                                    +as xs:base64Binary{
                                    +   let $sink := Q{java:java.io.ByteArrayOutputStream}new()
                                    +   let $saved := PDDocument:save($pdf, $sink)
                                    +   return convert:integers-to-base64(
                                    +            Q{java:java.io.ByteArrayOutputStream}toByteArray($sink)
                                    +          )
                                    +};
                                    +
                                    +(:~ Close $pdf, releasing any resources related to it.
                                    + @return empty sequence
                                    + :)
                                    +declare function pdfbox:close($pdf as item())
                                    +as empty-sequence(){
                                    +  (# db:wrapjava void #) { $pdf => PDDocument:close() }
                                    +};
                                    +
                                    +(:~ Page count of $pdf, as reported by PDDocument:getNumberOfPages :)
                                    +declare function pdfbox:number-of-pages($pdf as item())
                                    +as xs:integer{
                                    +  $pdf => PDDocument:getNumberOfPages()
                                    +};
                                    +
                                    +(:~ Render one page of $pdf as an image (page zero is the cover).
                                    + @param $pdf pdf object
                                    + @param $pageNo page index passed to PDFRenderer:renderImage
                                    + @param $options map, entries given here override the defaults
                                    +        "format": "bmp jpg png gif" etc, default "jpg"
                                    +        "scale": default 1 (1 is 72 dpi??)
                                    + @return the encoded image as xs:base64Binary
                                    + :)
                                    +declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                                    +as xs:base64Binary{
                                    +  (: caller options come first in the merge, so they win over the defaults :)
                                    +  let $settings := map:merge(($options, map{"format":"jpg","scale":1}))
                                    +  let $renderer := PDFRenderer:new($pdf)
                                    +  let $image := PDFRenderer:renderImage($renderer, $pageNo, $settings?scale)
                                    +  let $sink := Q{java:java.io.ByteArrayOutputStream}new()
                                    +  let $written := Q{java:javax.imageio.ImageIO}write($image, $settings?format, $sink)
                                    +  return convert:integers-to-base64(
                                    +           Q{java:java.io.ByteArrayOutputStream}toByteArray($sink)
                                    +         )
                                    +};
                                    +
                                    +
                                    +(:~ property access map
                                    +   keys are property names, 
                                    +   values are sequences of functions to get property from $pdf object:
                                    +   pdfbox:property applies them left to right, starting with the $pdf object,
                                    +   each function receiving the result of the previous one
                                    +:)
                                    +declare %private variable $pdfbox:property-map:=map{
                                    +  (: properties computed by a single function of this module :)
                                    +  "pageCount": pdfbox:number-of-pages#1,
                                    +
                                    +  "hasOutline": pdfbox:hasOutline#1,
                                    +
                                    +  "hasLabels": pdfbox:hasLabels#1,
                                    +
                                    +  "specification":pdfbox:specification#1,
                                    +
                                    +  (: document information entries: get the information object, then read one field :)
                                    +  "title": (PDDocument:getDocumentInformation#1,
                                    +            PDDocumentInformation:getTitle#1) ,
                                    +
                                    +  "author": (PDDocument:getDocumentInformation#1,
                                    +             PDDocumentInformation:getAuthor#1 ),
                                    +
                                    +  "creator": (PDDocument:getDocumentInformation#1,
                                    +              PDDocumentInformation:getCreator#1),
                                    +
                                    +  "producer": (PDDocument:getDocumentInformation#1,
                                    +               PDDocumentInformation:getProducer#1),
                                    +
                                    +  "subject": (PDDocument:getDocumentInformation#1,
                                    +              PDDocumentInformation:getSubject#1),
                                    +
                                    +  "keywords": (PDDocument:getDocumentInformation#1,
                                    +               PDDocumentInformation:getKeywords#1),
                                    +
                                    +  (: NOTE(review): pdfbox:gregToISO is defined outside this excerpt; presumably it
                                    +     converts the date value returned by the getter to an ISO string - confirm :)
                                    +  "creationDate": (PDDocument:getDocumentInformation#1,
                                    +                   PDDocumentInformation:getCreationDate#1,
                                    +                   pdfbox:gregToISO#1),
                                    +
                                    +  "modificationDate":  (PDDocument:getDocumentInformation#1,
                                    +                        PDDocumentInformation:getModificationDate#1,
                                    +                        pdfbox:gregToISO#1)
                                    +};
                                    +
                                    +(:~ names of all properties known to pdfbox:property, in sorted order :)
                                    +declare function pdfbox:property-names() 
                                    +as xs:string*{
                                    +  sort(map:keys($pdfbox:property-map))
                                    +};
                                    +
                                    +(:~ value of the property named $property for $pdf.
                                    + @param $pdf pdf object
                                    + @param $property a key of the property map, see pdfbox:property-names
                                    + @error pdfbox:property if $property is not a known property name
                                    + :)
                                    +declare function pdfbox:property($pdf as item(),$property as xs:string)
                                    +as item()*{
                                    +  (: accessor chain registered for $property; empty when the name is unknown :)
                                    +  let $chain := $pdfbox:property-map?($property)
                                    +  (: feed the running result into the next accessor :)
                                    +  let $apply := function($acc,$step as function(*)){$step($acc)}
                                    +  return if(exists($chain))
                                    +         then fold-left($chain, $pdf, $apply)
                                    +         else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                                    +};
                                    +
                                    +(:~ summary CSV style info for all properties for $pdfpaths.
                                    + Delegates to the two-argument form with every key of the property map.
                                    + NOTE(review): $pdfpaths is xs:string* here but item()* in the two-argument
                                    + form, so xs:base64Binary items are only accepted there - confirm this is intended.
                                    +:)
                                    +declare function pdfbox:report($pdfpaths as xs:string*)
                                    +as map(*){
                                    + let $all-properties := map:keys($pdfbox:property-map)
                                    + return pdfbox:report($pdfpaths, $all-properties)
                                    +};
                                    +
                                    +(:~ summary CSV style info for named properties for $pdfpaths.
                                    + Each pdf is opened, queried and closed again; the pdf is also closed
                                    + when reading one of its properties fails.
                                    +@param $pdfpaths sources as accepted by pdfbox:open; xs:base64Binary items are named "binary"
                                    +@param $properties names of the properties to report
                                    +@return map with "names": array of column names starting with "path", and
                                    +        "records": one array per pdf holding its name and the property values as strings;
                                    +        all property values are "#ERROR" if the pdf could not be opened or read
                                    +@see https://docs.basex.org/main/CSV_Functions#xquery
                                    +:)
                                    +declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                                    +as map(*){
                                    +  map{"names":   array{"path",$properties},
                                    +  
                                    +      "records": for $path in $pdfpaths
                                    +                 let $name:=if($path instance of xs:base64Binary) then "binary" else $path
                                    +                 return try{
                                    +                  let $pdf:=pdfbox:open($path)
                                    +                  return try{
                                    +                           fold-left($properties,
                                    +                                     array{$name},
                                    +                                     function($result as array(*),$prop as xs:string){
                                    +                                       array:append($result, string(pdfbox:property($pdf, $prop)))}
                                    +                           ), pdfbox:close($pdf)
                                    +                         } catch *{
                                    +                           (: the pdf is open at this point: close it, then pass the
                                    +                              failure on so the outer catch builds the #ERROR row :)
                                    +                           pdfbox:close($pdf), error($err:code,string($err:description),$err:value)
                                    +                         }
                                    +                 } catch *{
                                    +                      fold-left($properties,
                                    +                                array{$name},
                                    +                                function($result as array(*),$prop as xs:string){
                                    +                                    array:append($result, "#ERROR")}
                                    +                               )
                                    +                 }
                                    +               
                                    +  }
                                    +};
                                    +
                                    +(:~ true if the document catalog of $pdf returns a document outline :)
                                    +declare function pdfbox:hasOutline($pdf as item())
                                    +as xs:boolean{
                                    +  let $catalog := PDDocument:getDocumentCatalog($pdf)
                                    +  return exists(PDDocumentCatalog:getDocumentOutline($catalog))
                                    +};
                                    +
                                    +(:~ true if the document catalog of $pdf returns page labels :)
                                    +declare function pdfbox:hasLabels($pdf as item())
                                    +as xs:boolean{
                                    +  let $catalog := PDDocument:getDocumentCatalog($pdf)
                                    +  return exists(PDDocumentCatalog:getPageLabels($catalog))
                                    +};
                                    +
                                    +(:~ XMP metadata as "RDF" document
                                    +@note usually rdf:RDF root, but sometimes x:xmpmeta 
                                    +@param $pdf pdf object
                                    +@return the parsed XMP metadata, or empty sequence if the catalog has no metadata
                                    +:)
                                    +declare function pdfbox:metadata($pdf as item())
                                    +as document-node(element(*))?
                                    +{
                                    +  let $m:=PDDocument:getDocumentCatalog($pdf)
                                    +         =>PDDocumentCatalog:getMetadata()
                                    +  return  if(exists($m))
                                    +          then 
                                    +              (: read the whole stream before decoding: converting fixed-size blocks
                                    +                 to strings one by one fails when a multi-byte UTF-8 character
                                    +                 is split across two reads :)
                                    +              let $is:=PDMetadata:exportXMPMetadata($m)
                                    +              return Q{java:java.io.InputStream}readAllBytes($is)
                                    +                     =>convert:integers-to-base64()
                                    +                     =>convert:binary-to-string()
                                    +                     =>parse-xml()
                                    +          else ()
                                    +};
                                    +
                                    +(:~ Read the next block (up to 4096 bytes) from the XMP stream $is and append it to $read.
                                    +@param $is input stream, passed to COSInputStream:read
                                    +@param $read text collected by earlier calls
                                    +@return map{"n": result of the read call, "data": $read followed by the text of this block}
                                    +@note NOTE(review): each block is converted to a string on its own, so a multi-byte
                                    +      character that straddles a block boundary may be decoded incorrectly - confirm
                                    +:)
                                    +declare %private function pdfbox:read-stream($is,$read as xs:string)
                                    +as map(*){
                                    +  let $blen:=4096
                                    +  (: buffer of $blen entries, built by extending a one-element byte array with Arrays.copyOf :)
                                    +  let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen)
                                    +  let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
                                    +  (: only the first $n buffer entries are converted; pdfbox:metadata treats $n = -1 as end of input :)
                                    +  let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
                                    +  return map{"n":$n, "data": $read || $data}
                                    +};
                                    +
                                    +(:~ Outline (bookmarks) of $pdf as a sequence of maps, one map per top level bookmark.
                                    +Each map is built by pdfbox:bookmark (keys "index", "title", "hasChildren");
                                    +bookmarks with children additionally carry a "children" entry holding nested maps.
                                    +@param $pdf pdf object (e.g. the result of pdfbox:open)
                                    +@return bookmark maps, or empty sequence if the document has no outline
                                    +:)
                                    +declare function pdfbox:outline($pdf as item())
                                    +as map(*)*{
                                    +  (# db:wrapjava some #) {
                                    +  let $outline:=
                                    +                PDDocument:getDocumentCatalog($pdf)
                                    +                =>PDDocumentCatalog:getDocumentOutline()
                                    + 
                                    +  (: explicit else branch: required by XQuery 3.1 and consistent with pdfbox:labels / pdfbox:outline-xml :)
                                    +  return  if(exists($outline))
                                    +          then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) 
                                    +          else ()
                                    +  }
                                    +};
                                    +
                                    +(:~ Bookmark info for $outlineItem and its following siblings, as a sequence of maps.
                                    +@param $pdf pdf object the outline belongs to
                                    +@param $outlineItem first outline item of a level; may be empty
                                    +@return one map per item of the level (see pdfbox:bookmark), empty sequence if $outlineItem is empty
                                    +:)
                                    +declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
                                    +as map(*)*{
                                    +  (: Guard: pdfbox:outline_ passes the current item to pdfbox:bookmark, which requires
                                    +     exactly one item. A do-until loop runs its action before the first predicate test,
                                    +     so an empty start item would raise a type error instead of yielding no bookmarks. :)
                                    +  if(empty($outlineItem))
                                    +  then ()
                                    +  else
                                    +    let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
                                    +    return map:get($find,"list")
                                    +};
                                    +
                                    +(:~ Walk the sibling chain starting at $outlineItem and collect one bookmark map per item.
                                    +Kept as a separate function on purpose: BaseX bug 10.7? error if inlined in outline.
                                    +@param $pdf pdf object the outline belongs to
                                    +@param $outlineItem first outline item of the level to walk
                                    +@return final loop state map{"list": collected bookmark maps, "this": next item (empty when done)}
                                    +:)
                                    +declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                                    +as map(*){
                                    +  pdfbox:do-until(
                                    +    
                                    +     (: loop state: bookmarks collected so far and the item to process next :)
                                    +     map{"list":(),"this":$outlineItem},
                                    +
                                    +     function($input,$pos ) { 
                                    +        let $bk:= pdfbox:bookmark($input?this,$pdf)
                                    +        (: recurse into the children and attach them under the "children" key :)
                                    +        let $bk:= if($bk?hasChildren)
                                    +                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                                    +                        return map:merge(($bk,map:entry("children",$kids)))
                                    +                  else $bk 
                                    +        return map{
                                    +              "list": ($input?list, $bk),
                                    +              "this":  PDOutlineItem:getNextSibling($input?this)}
                                    +      },
                                    +
                                    +     (: stop when there is no further sibling :)
                                    +     function($output,$pos) { empty($output?this) }                      
                                    +  )
                                    +};
                                    +
                                    +(:~ Outline of $pdf as an XML tree.
                                    +@param $pdf pdf object (e.g. the result of pdfbox:open)
                                    +@return outline element wrapping one bookmark element per top level bookmark,
                                    +        or empty sequence if pdfbox:outline returns nothing
                                    +:)
                                    +declare function pdfbox:outline-xml($pdf as item())
                                    +as element(outline)?{
                                    +  let $bookmarks := pdfbox:outline($pdf)
                                    +  where exists($bookmarks)
                                    +  return <outline>{pdfbox:bookmark-xml($bookmarks)}</outline>
                                    +};
                                    +
                                    +(:~ Recursively convert outline maps to bookmark elements.
                                    +@param $outline bookmark maps with keys "title", "index" and optionally "children"
                                    +@return one bookmark element per map, with nested bookmark elements for its children
                                    +:)
                                    +declare %private function pdfbox:bookmark-xml($outline as map(*)*)
                                    +as element(bookmark)*
                                    +{
                                    +  for $entry in $outline
                                    +  return
                                    +  <bookmark title="{$entry?title}" index="{$entry?index}">
                                    +    {for $child in $entry?children return pdfbox:bookmark-xml($child)}
                                    +  </bookmark>
                                    +};
                                    +
                                    +(:~ Describe a single outline entry.
                                    +@param $bookmark outline item (only ever passed to PDOutlineItem functions)
                                    +@param $pdf pdf object the bookmark belongs to
                                    +@return map{index:..,title:..,hasChildren:..}; index is the result of
                                    +        pdfbox:find-page for the bookmark's destination page
                                    +:)
                                    +declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
                                    +as map(*)
                                    +{
                                    +  let $destination := PDOutlineItem:findDestinationPage($bookmark,$pdf)
                                    +  (: title is used as returned; no character clean-up (translate) is applied :)
                                    +  let $title := (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
                                    +  return map{
                                    +    "index": pdfbox:find-page($destination,$pdf),
                                    +    "title": $title,
                                    +    "hasChildren": PDOutlineItem:hasChildren($bookmark)
                                    +  }
                                    +};
                                    +
                                    +
                                    +(:~ Index of $page within the page tree of $pdf.
                                    +@param $page page object (as java:org.apache.pdfbox.pdmodel.PDPage); may be empty
                                    +@param $pdf pdf object to search
                                    +@return result of PDPageTree:indexOf for $page, or empty sequence if $page is empty
                                    +:)
                                    +declare function pdfbox:find-page(
                                    +   $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
                                    +   $pdf as item())
                                    +as item()?
                                    +{
                                    +  (: explicit else branch: required by XQuery 3.1 and consistent with the other functions of this module :)
                                    +  if(exists($page))
                                    +  then PDDocument:getDocumentCatalog($pdf)
                                    +      =>PDDocumentCatalog:getPages()
                                    +      =>PDPageTree:indexOf($page)
                                    +  else ()
                                    +};
                                    +
                                    +(:~ Return a new PDF document with pages from $start to $end as xs:base64Binary, (1 based).
                                    +The intermediate document created by PageExtractor is closed after it has been
                                    +serialised, and also when serialisation fails, so it is not leaked on error.
                                    +@param $pdf source pdf object
                                    +@param $start first page to include
                                    +@param $end last page to include
                                    +@return binary representation of the extracted document
                                    +@error re-raises any error raised while creating the binary
                                    +:)
                                    +declare function pdfbox:extract-range($pdf as item(), 
                                    +             $start as xs:integer,$end as xs:integer)
                                    +as xs:base64Binary
                                    +{
                                    +    let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
                                    +    return try{
                                    +              pdfbox:binary($a),pdfbox:close($a)
                                    +           } catch *{
                                    +              (: same open/use/close pattern as pdfbox:with-pdf: release the document, then re-raise :)
                                    +              pdfbox:close($a),error($err:code,$err:description,$err:value)
                                    +           }
                                    +};
                                    +
                                    +
                                    +(:~ Page label for every page of $pdf, or empty sequence if the document defines none.
                                    +@param $pdf pdf object (e.g. the result of pdfbox:open)
                                    +@return result of PDPageLabels:getLabelsByPageIndices, or empty sequence
                                    +@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
                                    +@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
                                    +:)
                                    +declare function pdfbox:labels($pdf as item())
                                    +as xs:string*
                                    +{
                                    +  let $catalog := PDDocument:getDocumentCatalog($pdf)
                                    +  let $labelDefs := PDDocumentCatalog:getPageLabels($catalog)
                                    +  return if(empty($labelDefs))
                                    +         then ()
                                    +         else PDPageLabels:getLabelsByPageIndices($labelDefs)
                                    +};
                                    +
                                    +(:~ Return the text found on page $pageNo of $pdf.
                                    +A PDFTextStripper is configured with $pageNo as both start and end page,
                                    +then asked for the text of $pdf.
                                    +@param $pdf pdf object (e.g. the result of pdfbox:open)
                                    +@param $pageNo page to extract
                                    +@return text of the page
                                    +@note NOTE(review): PDFTextStripper page numbers are presumably 1 based, unlike the
                                    +      zero based $pageNo of pdfbox:page-media-box - confirm
                                    +:)
                                    +declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                                    +as xs:string{
                                    +  (: presumably the wrapjava pragma keeps the stripper as a Java instance so the setter calls can be chained - verify :)
                                    +  let $tStripper := (# db:wrapjava instance #) {
                                    +         PDFTextStripper:new()
                                    +         => PDFTextStripper:setStartPage($pageNo)
                                    +         => PDFTextStripper:setEndPage($pageNo)
                                    +       }
                                    +  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                                    +};
                                    +
                                    +(:~ Media box of page $pageNo (zero based), rendered as a string.
                                    +@param $pdf pdf object (e.g. the result of pdfbox:open)
                                    +@param $pageNo zero based page index
                                    +@return string form of the page's media box, e.g. [0.0,0.0,168.0,239.52]
                                    +:)
                                    +declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
                                    +as xs:string{
                                    +  let $page := PDDocument:getPage($pdf, $pageNo)
                                    +  let $box := PDPage:getMediaBox($page)
                                    +  return PDRectangle:toString($box)
                                    +};
                                    +
                                    +(:~ Version of the Apache Pdfbox library in use, e.g. "3.0.4".
                                    +@return value reported by org.apache.pdfbox.util.Version.getVersion()
                                    +:)
                                    +declare function pdfbox:version()
                                    +as xs:string{
                                    +  let $version := Q{java:org.apache.pdfbox.util.Version}getVersion()
                                    +  return $version
                                    +};
                                    +
                                    +(:~ Convert a Java GregorianCalendar value to its date-time string.
                                    +@param $item calendar object, may be empty
                                    +@return string value of GregorianCalendar.toZonedDateTime($item), or empty sequence if $item is empty
                                    +:)
                                    +declare %private
                                    +function pdfbox:gregToISO($item as item()?)
                                    +as xs:string?{
                                    +  for $calendar in $item
                                    +  return string(Q{java:java.util.GregorianCalendar}toZonedDateTime($calendar))
                                    +};
                                    +
                                    +(:~ fn:do-until shim for BaseX 9+10.
                                    +Uses fn:do-until when the processor provides it, otherwise falls back to hof:until;
                                    +in the fallback $pos is always zero. Raises pdfbox:do-until if neither function exists.
                                    +@param $input initial value
                                    +@param $action function producing the next value from the current one
                                    +@param $predicate function deciding whether the loop is finished
                                    +@return value for which $predicate reported completion
                                    +@note NOTE(review): hof:until presumably tests $predicate before the first call of
                                    +      $action, whereas fn:do-until calls $action first - confirm callers do not depend on this
                                    +:)
                                    +declare %private function pdfbox:do-until(
                                    + $input as item()*,
                                    + $action as function(item()*, xs:integer) as item()*,
                                    + $predicate as function(item()*, xs:integer) as xs:boolean?
                                    +) as item()*
                                    +{
                                    +  let $standard := function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
                                    +  let $legacy := function-lookup(QName('http://basex.org/modules/hof','until'), 3)
                                    +  return
                                    +    if(exists($standard)) then $standard($input,$action,$predicate)
                                    +    else if(exists($legacy)) then $legacy($predicate(?,0),$action(?,0),$input)
                                    +    else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
                                    +};
                                    +
                                    \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqdoc.xml b/docs/xqdoc/modules/F000001/xqdoc.xml new file mode 100644 index 0000000..8509c11 --- /dev/null +++ b/docs/xqdoc/modules/F000001/xqdoc.xml @@ -0,0 +1,774 @@ +2025-06-01T21:16:07.687+01:001.1org.expkg_zone58.Pdfbox3pdfbox + +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, i.e. in custom or xar +tested with pdfbox-app-3.0.5.jar +Andy Bunce 2025https://pdfbox.apache.org/download.cgihttps://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/xquery version '3.1'; +(:~ +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, i.e. in custom or xar +tested with pdfbox-app-3.0.5.jar +@see https://pdfbox.apache.org/download.cgi +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ +@author Andy Bunce 2025 +:) + +module namespace pdfbox="org.expkg_zone58.Pdfbox3"; + +declare namespace Loader ="java:org.apache.pdfbox.Loader"; +declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; +declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; +declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; +declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; +declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; +declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; +declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; +declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; +declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; +declare namespace 
PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; +declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; +declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + +declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; +declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; +declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; + +declare namespace File ="java:java.io.File"; + + + +(:~ with-document pattern: open pdf,apply function, close pdf + creates a local pdfobject and ensures it is closed after use +e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) +:) +declare function pdfbox:with-pdf($src as xs:string, + $fn as function(item())as item()*) +as item()*{ + let $pdf:=pdfbox:open($src) + return try{ + $fn($pdf),pdfbox:close($pdf) + } catch *{ + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + } + +}; + + +(:~ open pdf using fetch:binary, returns pdf object :) +declare function pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +}; + +(:~ open pdf from file/url/binary, opts may have password , returns pdf object +@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item +@param $opts options otionally with map {"password":} +:) +declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) +as item(){ + try{ + + if($pdfsrc instance of xs:base64Binary) + then Loader:loadPDF( $pdfsrc,string($opts?password)) + else if(starts-with($pdfsrc,"http")) + then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) + else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) + +} catch *{ + let $loc:=if($pdfsrc instance of xs:base64Binary) + then "xs:base64Binary" + else $pdfsrc + return error(xs:QName("pdfbox:open"),"Failed PDF load " 
|| $loc || " " || $err:description) +} +}; + +(:~ The version of the PDF specification used by $pdf e.g "1.4" +returned as string to avoid float rounding issues + :) +declare function pdfbox:specification($pdf as item()) +as xs:string{ + PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() +}; + +(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +declare function pdfbox:save($pdf as item(),$savepath as xs:string) +as xs:string{ + PDDocument:save($pdf, File:new($savepath)),$savepath +}; + +(:~ Create binary representation of $pdf object as xs:base64Binary :) +declare function pdfbox:binary($pdf as item()) +as xs:base64Binary{ + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=PDDocument:save($pdf, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() +}; + +(:~ Release any resources related to $pdf:) +declare function pdfbox:close($pdf as item()) +as empty-sequence(){ + (# db:wrapjava void #) { + PDDocument:close($pdf) + } +}; + +(:~ Number of pages in PDF:) +declare function pdfbox:number-of-pages($pdf as item()) +as xs:integer{ + PDDocument:getNumberOfPages($pdf) +}; + +(:~ Pdf page as image (zero is cover) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) +declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +as xs:base64Binary{ + let $options:=map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() + +}; + + +(:~ property access map + keys are property names, + values are sequences of functions to get property from $pdf object +:) +declare %private variable $pdfbox:property-map:=map{ + "pageCount": pdfbox:number-of-pages#1, + + "hasOutline": pdfbox:hasOutline#1, + + "hasLabels": pdfbox:hasLabels#1, + + "specification":pdfbox:specification#1, + + "title": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getTitle#1) , + + "author": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getAuthor#1 ), + + "creator": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreator#1), + + "producer": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getProducer#1), + + "subject": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getSubject#1), + + "keywords": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getKeywords#1), + + "creationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreationDate#1, + pdfbox:gregToISO#1), + + "modificationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getModificationDate#1, + pdfbox:gregToISO#1) +}; + +(:~ known property names sorted :) +declare function pdfbox:property-names() +as xs:string*{ + $pdfbox:property-map=>map:keys()=>sort() +}; + +(:~ return value of $property for $pdf :) +declare function pdfbox:property($pdf as item(),$property as xs:string) +as item()*{ + let $fns:= 
$pdfbox:property-map($property) + return if(exists($fns)) + then fold-left($fns, + $pdf, + function($result,$this as function(*)){$this($result)}) + else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) +}; + +(:~ summary CSV style info for all properties for $pdfpaths +:) +declare function pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) +}; + +(:~ summary CSV style info for named properties for $pdfpaths +@see https://docs.basex.org/main/CSV_Functions#xquery +:) +declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*) +as map(*){ + map{"names": array{"path",$properties}, + + "records": for $path in $pdfpaths + let $name:=if($path instance of xs:base64Binary) then "binary" else $path + return try{ + let $pdf:=pdfbox:open($path) + return (fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, string(pdfbox:property($pdf, $prop)))} + ), pdfbox:close($pdf) + ) + } catch *{ + fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, "#ERROR")} + ) + } + + } +}; + +(:~ true if $pdf has an outline :) +declare function pdfbox:hasOutline($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + =>exists() +}; + +(:~ true if $pdf has Labels :) +declare function pdfbox:hasLabels($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>exists() +}; + +(:~ XMP metadata as "RDF" document +@note usually rdf:RDF root, but sometimes x:xmpmeta +:) +declare function pdfbox:metadata($pdf as item()) +as document-node(element(*))? 
+{ + let $m:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getMetadata() + return if(exists($m)) + then + let $is:=PDMetadata:exportXMPMetadata($m) + return pdfbox:do-until( + map{"n":0,"data":""}, + + function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, + + function($output,$pos) { $output?n eq -1 } + )?data=>parse-xml() + else () +}; + +(:~ read next block from XMP stream :) +declare %private function pdfbox:read-stream($is,$read as xs:string) +as map(*){ + let $blen:=4096 + let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) + let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) + let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() + return map{"n":$n, "data": $read || $data} +}; + +(:~ outline for $pdf as map()* :) +declare function pdfbox:outline($pdf as item()) +as map(*)*{ + (# db:wrapjava some #) { + let $outline:= + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + + return if(exists($outline)) + then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) + } +}; + +(:~ return bookmark info for children of $outlineItem as seq of maps :) +declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) +as map(*)*{ + let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) + return map:get($find,"list") +}; + +(:~ BaseX bug 10.7? error if inlined in outline :) +declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
+as map(*){ + pdfbox:do-until( + + map{"list":(),"this":$outlineItem}, + + function($input,$pos ) { + let $bk:= pdfbox:bookmark($input?this,$pdf) + let $bk:= if($bk?hasChildren) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + return map:merge(($bk,map:entry("children",$kids))) + else $bk + return map{ + "list": ($input?list, $bk), + "this": PDOutlineItem:getNextSibling($input?this)} + }, + + function($output,$pos) { empty($output?this) } + ) +}; + +(:~ PDF outline in xml format :) +declare function pdfbox:outline-xml($pdf as item()) +as element(outline)?{ + let $outline:=pdfbox:outline($pdf) + return if(exists($outline)) + then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> + else () +}; + +(:~ recursive ouutline map to XML :) +declare %private function pdfbox:bookmark-xml($outline as map(*)*) +as element(bookmark)* +{ + $outline! + <bookmark title="{?title}" index="{?index}"> + {?children!pdfbox:bookmark-xml(.)} + </bookmark> +}; + +(:~ return bookmark info for $bookmark +@return map{index:..,title:..,hasChildren:..} +:) +declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) +as map(*) +{ + map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + (:=>translate("�",""), :), + "hasChildren": PDOutlineItem:hasChildren($bookmark) + } +}; + + +(:~ pageIndex of $page in $pdf :) +declare function pdfbox:find-page( + $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), + $pdf as item()) +as item()? 
+{ + if(exists($page)) + then PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPages() + =>PDPageTree:indexOf($page) +}; + +(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +@param $start first page to include +@param $end last page to include +:) +declare function pdfbox:extract-range($pdf as item(), + $start as xs:integer,$end as xs:integer) +as xs:base64Binary +{ + let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() + return (pdfbox:binary($a),pdfbox:close($a)) +}; + + +(:~ pageLabel for every page or empty if none +@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples +@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files +:) +declare function pdfbox:labels($pdf as item()) +as xs:string* +{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($pagelabels)) + then PDPageLabels:getLabelsByPageIndices($pagelabels) + else () +}; + +(:~ return text on $pageNo :) +declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) +as xs:string{ + let $tStripper := (# db:wrapjava instance #) { + PDFTextStripper:new() + => PDFTextStripper:setStartPage($pageNo) + => PDFTextStripper:setEndPage($pageNo) + } + return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} +}; + +(:~ return size of $pageNo (zero based) +@result e.g. [0.0,0.0,168.0,239.52] + :) +declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) +as xs:string{ + PDDocument:getPage($pdf, $pageNo) + =>PDPage:getMediaBox() + =>PDRectangle:toString() +}; + +(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +declare function pdfbox:version() +as xs:string{ + Q{java:org.apache.pdfbox.util.Version}getVersion() +}; + +(:~ convert date :) +declare %private +function pdfbox:gregToISO($item as item()?) 
+as xs:string?{ + if(exists($item)) + then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() + else () +}; + +(:~ fn:do-until shim for BaseX 9+10 +if fn:do-until not found use hof:until, note: $pos always zero +:) +declare %private function pdfbox:do-until( + $input as item()*, + $action as function(item()*, xs:integer) as item()*, + $predicate as function(item()*, xs:integer) as xs:boolean? +) as item()* +{ + let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) + return if(exists($fn)) + then $fn($input,$action,$predicate) + else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) + return if(exists($hof)) + then $hof($predicate(?,0),$action(?,0),$input) + else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") + +}; +pdfbox:property-map +property access map +keys are property names, +values are sequences of functions to get property from $pdf object +org.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3hasOutlineorg.expkg_zone58.Pdfbox3hasLabelsorg.expkg_zone58.Pdfbox3specificationjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetTitlejava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetAuthorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreatorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetProducerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetSubjectjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetKeywordsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreationDateorg.expkg_zone58.
Pdfbox3gregToISOjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetModificationDateorg.expkg_zone58.Pdfbox3gregToISOvariable $pdfbox:property-map:=map{ + "pageCount": pdfbox:number-of-pages#1, + + "hasOutline": pdfbox:hasOutline#1, + + "hasLabels": pdfbox:hasLabels#1, + + "specification":pdfbox:specification#1, + + "title": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getTitle#1) , + + "author": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getAuthor#1 ), + + "creator": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreator#1), + + "producer": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getProducer#1), + + "subject": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getSubject#1), + + "keywords": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getKeywords#1), + + "creationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreationDate#1, + pdfbox:gregToISO#1), + + "modificationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getModificationDate#1, + pdfbox:gregToISO#1) +} +with-document pattern: open pdf,apply function, close pdf +creates a local pdfobject and ensures it is closed after use +e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) +pdfbox:with-pdffunction pdfbox:with-pdf ( $src as xs:string, $fn as function(item())as item()* ) as item()* { let $pdf:=pdfbox:open($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }srcxs:stringfnfunction(item())as item()*item()org.expkg_zone58.Pdfbox3openorg.expkg_zone58.Pdfbox3closeorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2005/xqt-errorscodehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:with-pdf($src as xs:string, + $fn as function(item())as item()*) +as item()*{ + let 
$pdf:=pdfbox:open($src) + return try{ + $fn($pdf),pdfbox:close($pdf) + } catch *{ + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + } + +} +open pdf using fetch:binary, returns pdf objectpdfbox:openfunction pdfbox:open ( $pdfsrc as item() ) as item() { pdfbox:open($pdfsrc, map{}) }pdfsrcitem()item()org.expkg_zone58.Pdfbox3openfunction pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +} +open pdf from file/url/binary, opts may have password , returns pdf object +$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options otionally with map {"password":}pdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item() { try{ if($pdfsrc instance of xs:base64Binary) then Loader:loadPDF( $pdfsrc,string($opts?password)) else if(starts-with($pdfsrc,"http")) then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) } catch *{ let $loc:=if($pdfsrc instance of xs:base64Binary) then "xs:base64Binary" else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } }pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) +as item(){ + try{ + + if($pdfsrc instance of xs:base64Binary) + then Loader:loadPDF( $pdfsrc,string($opts?password)) + else if(starts-with($pdfsrc,"http")) + then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) 
+ else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) + +} catch *{ + let $loc:=if($pdfsrc instance of xs:base64Binary) + then "xs:base64Binary" + else $pdfsrc + return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) +} +} +The version of the PDF specification used by $pdf e.g "1.4" +returned as string to avoid float rounding issues +pdfbox:specificationfunction pdfbox:specification ( $pdf as item() ) as xs:string { PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetVersionfunction pdfbox:specification($pdf as item()) +as xs:string{ + PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() +} +Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:savefunction pdfbox:save ( $pdf as item(),$savepath as xs:string ) as xs:string { PDDocument:save($pdf, File:new($savepath)),$savepath }pdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:save($pdf as item(),$savepath as xs:string) +as xs:string{ + PDDocument:save($pdf, File:new($savepath)),$savepath +} +Create binary representation of $pdf object as xs:base64Binarypdfbox:binaryfunction pdfbox:binary ( $pdf as item() ) as xs:base64Binary { let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=PDDocument:save($pdf, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()xs:base64Binaryjava:java.io.ByteArrayOutputStreamnewjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:binary($pdf as item()) +as xs:base64Binary{ + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=PDDocument:save($pdf, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() +} +Release any resources related to $pdfpdfbox:closefunction pdfbox:close ( 
$pdf as item() ) as empty-sequence() { (# db:wrapjava void #) { PDDocument:close($pdf) } }pdfitem()empty-sequencejava:org.apache.pdfbox.pdmodel.PDDocumentclosefunction pdfbox:close($pdf as item()) +as empty-sequence(){ + (# db:wrapjava void #) { + PDDocument:close($pdf) + } +} +Number of pages in PDFpdfbox:number-of-pagesfunction pdfbox:number-of-pages ( $pdf as item() ) as xs:integer { PDDocument:getNumberOfPages($pdf) }pdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetNumberOfPagesfunction pdfbox:number-of-pages($pdf as item()) +as xs:integer{ + PDDocument:getNumberOfPages($pdf) +} +Pdf page as image (zero is cover) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binary { let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +as xs:base64Binary{ + let $options:=map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() + +} 
+known property names sortedpdfbox:property-namesfunction pdfbox:property-names ( ) as xs:string* { $pdfbox:property-map=>map:keys()=>sort() }xs:stringorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property-names() +as xs:string*{ + $pdfbox:property-map=>map:keys()=>sort() +} +return value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()* { let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, function($result,$this as function(*)){$this($result)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) +as item()*{ + let $fns:= $pdfbox:property-map($property) + return if(exists($fns)) + then fold-left($fns, + $pdf, + function($result,$this as function(*)){$this($result)}) + else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) +} +summary CSV style info for all properties for $pdfpaths +pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*) { pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) }pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reporthttp://www.w3.org/2005/xpath-functions/mapkeysorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) +} +summary CSV style info for named properties for $pdfpaths +https://docs.basex.org/main/CSV_Functions#xquerypdfbox:reportfunction pdfbox:report ( $pdfpaths as item()*, $properties as xs:string* ) as map(*) { map{"names": array{"path",$properties}, "records": for $path in $pdfpaths let 
$name:=if($path instance of xs:base64Binary) then "binary" else $path return try{ let $pdf:=pdfbox:open($path) return (fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, string(pdfbox:property($pdf, $prop)))} ), pdfbox:close($pdf) ) } catch *{ fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, "#ERROR")} ) } } }pdfpathsitem()propertiesxs:stringmap(*)org.expkg_zone58.Pdfbox3openhttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendhttp://www.w3.org/2005/xpath-functionsstringorg.expkg_zone58.Pdfbox3propertyorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendfunction pdfbox:report($pdfpaths as item()*, $properties as xs:string*) +as map(*){ + map{"names": array{"path",$properties}, + + "records": for $path in $pdfpaths + let $name:=if($path instance of xs:base64Binary) then "binary" else $path + return try{ + let $pdf:=pdfbox:open($path) + return (fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, string(pdfbox:property($pdf, $prop)))} + ), pdfbox:close($pdf) + ) + } catch *{ + fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, "#ERROR")} + ) + } + + } +} +true if $pdf has an outlinepdfbox:hasOutlinefunction pdfbox:hasOutline ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasOutline($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + =>exists() +} +true if $pdf has Labelspdfbox:hasLabelsfunction pdfbox:hasLabels ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) 
=>PDDocumentCatalog:getPageLabels() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasLabels($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>exists() +} +XMP metadata as "RDF" document +usually rdf:RDF root, but sometimes x:xmpmetapdfbox:metadatafunction pdfbox:metadata ( $pdf as item() ) as document-node(element(*))? { let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () }pdfitem()document-node(element(*))java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDMetadataexportXMPMetadataorg.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3read-streamfunction pdfbox:metadata($pdf as item()) +as document-node(element(*))? 
+{ + let $m:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getMetadata() + return if(exists($m)) + then + let $is:=PDMetadata:exportXMPMetadata($m) + return pdfbox:do-until( + map{"n":0,"data":""}, + + function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, + + function($output,$pos) { $output?n eq -1 } + )?data=>parse-xml() + else () +} +read next block from XMP streampdfbox:read-streamfunction pdfbox:read-stream ( $is,$read as xs:string ) as map(*) { let $blen:=4096 let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() return map{"n":$n, "data": $read || $data} }isreadxs:stringmap(*)java:java.util.ArrayscopyOfhttp://www.w3.org/2001/XMLSchemabytejava:org.apache.pdfbox.cos.COSInputStreamreadhttp://www.w3.org/2001/XMLSchemainthttp://www.w3.org/2001/XMLSchemainthttp://basex.org/modules/convertintegers-to-base64http://www.w3.org/2005/xpath-functionssubsequencefunction pdfbox:read-stream($is,$read as xs:string) +as map(*){ + let $blen:=4096 + let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) + let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) + let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() + return map{"n":$n, "data": $read || $data} +} +outline for $pdf as map()*pdfbox:outlinefunction pdfbox:outline ( $pdf as item() ) as map(*)* { (# db:wrapjava some #) { let $outline:= PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() return if(exists($outline)) then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) } }pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildfunction pdfbox:outline($pdf as 
item()) +as map(*)*{ + (# db:wrapjava some #) { + let $outline:= + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + + return if(exists($outline)) + then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) + } +} +return bookmark info for children of $outlineItem as seq of mapspdfbox:outlinefunction pdfbox:outline ( $pdf as item(),$outlineItem as item()? ) as map(*)* { let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3outline_http://www.w3.org/2005/xpath-functions/mapgetfunction pdfbox:outline($pdf as item(),$outlineItem as item()?) +as map(*)*{ + let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) + return map:get($find,"list") +} +BaseX bug 10.7? error if inlined in outlinepdfbox:outline_function pdfbox:outline_ ( $pdf as item(),$outlineItem as item()? ) as map(*) { pdfbox:do-until( map{"list":(),"this":$outlineItem}, function($input,$pos ) { let $bk:= pdfbox:bookmark($input?this,$pdf) let $bk:= if($bk?hasChildren) then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ "list": ($input?list, $bk), "this": PDOutlineItem:getNextSibling($input?this)} }, function($output,$pos) { empty($output?this) } ) }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3bookmarkorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildhttp://www.w3.org/2005/xpath-functions/mapmergehttp://www.w3.org/2005/xpath-functions/mapentryjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetNextSiblinghttp://www.w3.org/2005/xpath-functionsemptyfunction pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
+as map(*){ + pdfbox:do-until( + + map{"list":(),"this":$outlineItem}, + + function($input,$pos ) { + let $bk:= pdfbox:bookmark($input?this,$pdf) + let $bk:= if($bk?hasChildren) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + return map:merge(($bk,map:entry("children",$kids))) + else $bk + return map{ + "list": ($input?list, $bk), + "this": PDOutlineItem:getNextSibling($input?this)} + }, + + function($output,$pos) { empty($output?this) } + ) +} +PDF outline in xml formatpdfbox:outline-xmlfunction pdfbox:outline-xml ( $pdf as item() ) as element(outline)? { let $outline:=pdfbox:outline($pdf) return if(exists($outline)) then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> else () }pdfitem()element(outline)org.expkg_zone58.Pdfbox3outlinehttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:outline-xml($pdf as item()) +as element(outline)?{ + let $outline:=pdfbox:outline($pdf) + return if(exists($outline)) + then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> + else () +} +recursive ouutline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)* { $outline! <bookmark title="{?title}" index="{?index}"> {?children!pdfbox:bookmark-xml(.)} </bookmark> }outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) +as element(bookmark)* +{ + $outline! 
+ <bookmark title="{?title}" index="{?index}"> + {?children!pdfbox:bookmark-xml(.)} + </bookmark> +} +return bookmark info for $bookmark +map{index:..,title:..,hasChildren:..}pdfbox:bookmarkfunction pdfbox:bookmark ( $bookmark as item(),$pdf as item() ) as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} (:=>translate("�",""), :), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }bookmarkitem()pdfitem()map(*)java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemfindDestinationPagejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetTitlejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemhasChildrenfunction pdfbox:bookmark($bookmark as item(),$pdf as item()) +as map(*) +{ + map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + (:=>translate("�",""), :), + "hasChildren": PDOutlineItem:hasChildren($bookmark) + } +} +pageIndex of $page in $pdfpdfbox:find-pagefunction pdfbox:find-page ( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $pdf as item() ) as item()? { if(exists($page)) then PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPages() =>PDPageTree:indexOf($page) }pageitem()pdfitem()item()http://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:find-page( + $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), + $pdf as item()) +as item()? 
+{ + if(exists($page)) + then PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPages() + =>PDPageTree:indexOf($page) +} +Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +$start first page to include$end last page to includepdfbox:extract-rangefunction pdfbox:extract-range ( $pdf as item(), $start as xs:integer,$end as xs:integer ) as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) }pdfitem()startxs:integerendxs:integerxs:base64Binaryjava:org.apache.pdfbox.multipdf.PageExtractorneworg.expkg_zone58.Pdfbox3binaryorg.expkg_zone58.Pdfbox3closefunction pdfbox:extract-range($pdf as item(), + $start as xs:integer,$end as xs:integer) +as xs:base64Binary +{ + let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() + return (pdfbox:binary($a),pdfbox:close($a)) +} +pageLabel for every page or empty if none +https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labelsfunction pdfbox:labels ( $pdf as item() ) as xs:string* { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return if(exists($pagelabels)) then PDPageLabels:getLabelsByPageIndices($pagelabels) else () }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetLabelsByPageIndicesfunction pdfbox:labels($pdf as item()) +as xs:string* +{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($pagelabels)) + then PDPageLabels:getLabelsByPageIndices($pagelabels) + else () +} +return text on $pageNopdfbox:page-textfunction pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string { let $tStripper := (# db:wrapjava instance #) { 
PDFTextStripper:new() => PDFTextStripper:setStartPage($pageNo) => PDFTextStripper:setEndPage($pageNo) } return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.text.PDFTextStrippernewjava:org.apache.pdfbox.text.PDFTextStrippergetTextfunction pdfbox:page-text($pdf as item(), $pageNo as xs:integer) +as xs:string{ + let $tStripper := (# db:wrapjava instance #) { + PDFTextStripper:new() + => PDFTextStripper:setStartPage($pageNo) + => PDFTextStripper:setEndPage($pageNo) + } + return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} +} +return size of $pageNo (zero based) +e.g. [0.0,0.0,168.0,239.52]pdfbox:page-media-boxfunction pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:string { PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetPagefunction pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) +as xs:string{ + PDDocument:getPage($pdf, $pageNo) + =>PDPage:getMediaBox() + =>PDRectangle:toString() +} +version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:string { Q{java:org.apache.pdfbox.util.Version}getVersion() }xs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() +as xs:string{ + Q{java:org.apache.pdfbox.util.Version}getVersion() +} +convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string? { if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() else () }itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) 
+as xs:string?{ + if(exists($item)) + then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() + else () +} +fn:do-until shim for BaseX 9+10 +if fn:do-until not found use hof:until, note: $pos always zero +pdfbox:do-untilfunction pdfbox:do-until ( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()* { let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) return if(exists($fn)) then $fn($input,$action,$predicate) else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) return if(exists($hof)) then $hof($predicate(?,0),$action(?,0),$input) else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") }inputitem()actionfunction(item()*, xs:integer) as item()*predicatefunction(item()*, xs:integer) as xs:boolean?item()http://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamefunction pdfbox:do-until( + $input as item()*, + $action as function(item()*, xs:integer) as item()*, + $predicate as function(item()*, xs:integer) as xs:boolean? 
+) as item()* +{ + let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) + return if(exists($fn)) + then $fn($input,$action,$predicate) + else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) + return if(exists($hof)) + then $hof($predicate(?,0),$action(?,0),$input) + else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") + +} \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqparse.xml b/docs/xqdoc/modules/F000001/xqparse.xml new file mode 100644 index 0000000..d14bbdb --- /dev/null +++ b/docs/xqdoc/modules/F000001/xqparse.xml @@ -0,0 +1,443 @@ +xquery version '3.1'; +(:~ +A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, i.e. in custom or xar +tested with pdfbox-app-3.0.5.jar +@see https://pdfbox.apache.org/download.cgi +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ +@author Andy Bunce 2025 +:) + +module namespace pdfbox="org.expkg_zone58.Pdfbox3"; + +declare namespace Loader ="java:org.apache.pdfbox.Loader"; +declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; +declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; +declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; +declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; +declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; +declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; +declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; +declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; +declare namespace 
PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; +declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; +declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; +declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + +declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + +declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; +declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; +declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; + +declare namespace File ="java:java.io.File"; + + + +(:~ with-document pattern: open pdf,apply function, close pdf + creates a local pdfobject and ensures it is closed after use +e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) +:) +declare function pdfbox:with-pdf($src as xs:string, + $fn as function(item())as item()*) +as item()*{ + let $pdf:=pdfbox:open($src) + return try{ + $fn($pdf),pdfbox:close($pdf) + } catch *{ + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + } + +}; + + +(:~ open pdf using fetch:binary, returns pdf object :) +declare function pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +}; + +(:~ open pdf from file/url/binary, opts may have password , returns pdf object +@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item +@param $opts options otionally with map {"password":} +:) +declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) +as item(){ + try{ + + if($pdfsrc instance of xs:base64Binary) + then Loader:loadPDF( $pdfsrc,string($opts?password)) + else if(starts-with($pdfsrc,"http")) + then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) + else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) + +} catch *{ + let $loc:=if($pdfsrc instance 
of xs:base64Binary) + then "xs:base64Binary" + else $pdfsrc + return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) +} +}; + +(:~ The version of the PDF specification used by $pdf e.g "1.4" +returned as string to avoid float rounding issues + :) +declare function pdfbox:specification($pdf as item()) +as xs:string{ + PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() +}; + +(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +declare function pdfbox:save($pdf as item(),$savepath as xs:string) +as xs:string{ + PDDocument:save($pdf, File:new($savepath)),$savepath +}; + +(:~ Create binary representation of $pdf object as xs:base64Binary :) +declare function pdfbox:binary($pdf as item()) +as xs:base64Binary{ + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=PDDocument:save($pdf, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() +}; + +(:~ Release any resources related to $pdf:) +declare function pdfbox:close($pdf as item()) +as empty-sequence(){ + (# db:wrapjava void #) { + PDDocument:close($pdf) + } +}; + +(:~ Number of pages in PDF:) +declare function pdfbox:number-of-pages($pdf as item()) +as xs:integer{ + PDDocument:getNumberOfPages($pdf) +}; + +(:~ Pdf page as image (zero is cover) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) +declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +as xs:base64Binary{ + let $options:=map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() + +}; + + +(:~ property access map + keys are property names, + values are sequences of functions to get property from $pdf object +:) +declare %private variable $pdfbox:property-map:=map{ + "pageCount": pdfbox:number-of-pages#1, + + "hasOutline": pdfbox:hasOutline#1, + + "hasLabels": pdfbox:hasLabels#1, + + "specification":pdfbox:specification#1, + + "title": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getTitle#1) , + + "author": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getAuthor#1 ), + + "creator": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreator#1), + + "producer": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getProducer#1), + + "subject": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getSubject#1), + + "keywords": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getKeywords#1), + + "creationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getCreationDate#1, + pdfbox:gregToISO#1), + + "modificationDate": (PDDocument:getDocumentInformation#1, + PDDocumentInformation:getModificationDate#1, + pdfbox:gregToISO#1) +}; + +(:~ known property names sorted :) +declare function pdfbox:property-names() +as xs:string*{ + $pdfbox:property-map=>map:keys()=>sort() +}; + +(:~ return value of $property for $pdf :) +declare function pdfbox:property($pdf as item(),$property as xs:string) +as item()*{ + let $fns:= 
$pdfbox:property-map($property) + return if(exists($fns)) + then fold-left($fns, + $pdf, + function($result,$this as function(*)){$this($result)}) + else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) +}; + +(:~ summary CSV style info for all properties for $pdfpaths +:) +declare function pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) +}; + +(:~ summary CSV style info for named properties for $pdfpaths +@see https://docs.basex.org/main/CSV_Functions#xquery +:) +declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*) +as map(*){ + map{"names": array{"path",$properties}, + + "records": for $path in $pdfpaths + let $name:=if($path instance of xs:base64Binary) then "binary" else $path + return try{ + let $pdf:=pdfbox:open($path) + return (fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, string(pdfbox:property($pdf, $prop)))} + ), pdfbox:close($pdf) + ) + } catch *{ + fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, "#ERROR")} + ) + } + + } +}; + +(:~ true if $pdf has an outline :) +declare function pdfbox:hasOutline($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + =>exists() +}; + +(:~ true if $pdf has Labels :) +declare function pdfbox:hasLabels($pdf as item()) +as xs:boolean{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>exists() +}; + +(:~ XMP metadata as "RDF" document +@note usually rdf:RDF root, but sometimes x:xmpmeta +:) +declare function pdfbox:metadata($pdf as item()) +as document-node(element(*))? 
+{ + let $m:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getMetadata() + return if(exists($m)) + then + let $is:=PDMetadata:exportXMPMetadata($m) + return pdfbox:do-until( + map{"n":0,"data":""}, + + function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, + + function($output,$pos) { $output?n eq -1 } + )?data=>parse-xml() + else () +}; + +(:~ read next block from XMP stream :) +declare %private function pdfbox:read-stream($is,$read as xs:string) +as map(*){ + let $blen:=4096 + let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) + let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) + let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() + return map{"n":$n, "data": $read || $data} +}; + +(:~ outline for $pdf as map()* :) +declare function pdfbox:outline($pdf as item()) +as map(*)*{ + (# db:wrapjava some #) { + let $outline:= + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + + return if(exists($outline)) + then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) + } +}; + +(:~ return bookmark info for children of $outlineItem as seq of maps :) +declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) +as map(*)*{ + let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) + return map:get($find,"list") +}; + +(:~ BaseX bug 10.7? error if inlined in outline :) +declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
+as map(*){ + pdfbox:do-until( + + map{"list":(),"this":$outlineItem}, + + function($input,$pos ) { + let $bk:= pdfbox:bookmark($input?this,$pdf) + let $bk:= if($bk?hasChildren) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + return map:merge(($bk,map:entry("children",$kids))) + else $bk + return map{ + "list": ($input?list, $bk), + "this": PDOutlineItem:getNextSibling($input?this)} + }, + + function($output,$pos) { empty($output?this) } + ) +}; + +(:~ PDF outline in xml format :) +declare function pdfbox:outline-xml($pdf as item()) +as element(outline)?{ + let $outline:=pdfbox:outline($pdf) + return if(exists($outline)) + then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> + else () +}; + +(:~ recursive ouutline map to XML :) +declare %private function pdfbox:bookmark-xml($outline as map(*)*) +as element(bookmark)* +{ + $outline! + <bookmark title="{?title}" index="{?index}"> + {?children!pdfbox:bookmark-xml(.)} + </bookmark> +}; + +(:~ return bookmark info for $bookmark +@return map{index:..,title:..,hasChildren:..} +:) +declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) +as map(*) +{ + map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + (:=>translate("�",""), :), + "hasChildren": PDOutlineItem:hasChildren($bookmark) + } +}; + + +(:~ pageIndex of $page in $pdf :) +declare function pdfbox:find-page( + $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), + $pdf as item()) +as item()? 
+{ + if(exists($page)) + then PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPages() + =>PDPageTree:indexOf($page) +}; + +(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) +@param $pdf source document +@param $start first page to include +@param $end last page to include +@return result of pdfbox:binary for the extracted document; pdfbox:close is then called on the extracted document +:) +declare function pdfbox:extract-range($pdf as item(), + $start as xs:integer,$end as xs:integer) +as xs:base64Binary +{ + let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() + return (pdfbox:binary($a),pdfbox:close($a)) +}; + + +(:~ pageLabel for every page or empty if none +@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples +@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files +:) +declare function pdfbox:labels($pdf as item()) +as xs:string* +{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($pagelabels)) + then PDPageLabels:getLabelsByPageIndices($pagelabels) + else () +}; + +(:~ return text on $pageNo +@param $pageNo used as both start and end page of the text stripper; NOTE(review): presumably 1 based, unlike pdfbox:page-media-box - confirm +:) +declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) +as xs:string{ + let $tStripper := (# db:wrapjava instance #) { + PDFTextStripper:new() + => PDFTextStripper:setStartPage($pageNo) + => PDFTextStripper:setEndPage($pageNo) + } + return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} +}; + +(:~ return size of $pageNo (zero based) +@return string form of the page media box, e.g. [0.0,0.0,168.0,239.52] + :) +declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) +as xs:string{ + PDDocument:getPage($pdf, $pageNo) + =>PDPage:getMediaBox() + =>PDRectangle:toString() +}; + +(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +declare function pdfbox:version() +as xs:string{ + Q{java:org.apache.pdfbox.util.Version}getVersion() +}; + +(:~ convert date: string value of GregorianCalendar toZonedDateTime for $item, empty if $item is empty :) +declare %private +function pdfbox:gregToISO($item as item()?) 
+as xs:string?{ + if(exists($item)) + then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() + else () +}; + +(:~ fn:do-until shim for BaseX 9+10 +if fn:do-until not found use hof:until, note: $pos always zero +raises pdfbox:do-until if neither function is found +NOTE(review): hof:until presumably tests the predicate on $input before the first action, while fn:do-until applies the action first, so the two paths can differ when $input already satisfies $predicate - confirm +:) +declare %private function pdfbox:do-until( + $input as item()*, + $action as function(item()*, xs:integer) as item()*, + $predicate as function(item()*, xs:integer) as xs:boolean? +) as item()* +{ + let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) + return if(exists($fn)) + then $fn($input,$action,$predicate) + else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) + return if(exists($hof)) + then $hof($predicate(?,0),$action(?,0),$input) + else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") + +}; + \ No newline at end of file diff --git a/docs/xqdoc/resources/base.css b/docs/xqdoc/resources/base.css new file mode 100644 index 0000000..ff84327 --- /dev/null +++ b/docs/xqdoc/resources/base.css @@ -0,0 +1,1153 @@ +/****************************************************************************** + * Style sheet for the W3C specifications * + * + * Special classes handled by this style sheet include: + * + * Indices + * - .toc for the Table of Contents (
                                      ) + * + for the section numbers + * - #toc for the Table of Contents (