diff --git a/.gitignore b/.gitignore index 5aaca2e..aa4ef6d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ data/ dist/ jars/* !jars/.gitignore +docs/xqdoc/ \ No newline at end of file diff --git a/docs/xqdoc/annotations.html b/docs/xqdoc/annotations.html deleted file mode 100644 index 8ca7cde..0000000 --- a/docs/xqdoc/annotations.html +++ /dev/null @@ -1,11 +0,0 @@ -src - xqDocA - xqDocA

- Project - src -  Annotations -

Summary

This project uses 1 annotation namespaces.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Annotations

2.1 http://www.w3.org/2012/xquery

private
\ No newline at end of file diff --git a/docs/xqdoc/imports.html b/docs/xqdoc/imports.html deleted file mode 100644 index 8cee469..0000000 --- a/docs/xqdoc/imports.html +++ /dev/null @@ -1,9 +0,0 @@ -src - xqDocA - xqDocA

Project src -  Imports -

Summary

Lists all modules imported.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
annotationsSummary of XQuery annotation usexhtml
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Imports (0)

\ No newline at end of file diff --git a/docs/xqdoc/index.html b/docs/xqdoc/index.html deleted file mode 100644 index 7f52373..0000000 --- a/docs/xqdoc/index.html +++ /dev/null @@ -1,14 +0,0 @@ -src - xqDocA - xqDocA

- Project src -  XQuery source documentation -

Summary

The project - src contains - 1 XQuery source files, and uses - 1 annotation namespaces. -

This document was built from source folder C:/Users/mrwhe/git/expkg-zone58/pdfbox/src/ on - Sunday, 1st June 2025.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
annotationsSummary of XQuery annotation usexhtml
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

XQuery Main (0)

None

XQuery Library (1)

UriPrefixDescriptionUseAMetrics
org.expkg_zone58.Pdfbox3pdfbox - -A BaseX 10.7+ interface to pdfbox 3.0 https...
0
Library
↖0
P
V#1
F#31

File view (1)

Annotation namespaces (1)

A total of 7 annotations are defined. -

http://www.w3.org/2012/xquery

private7
\ No newline at end of file diff --git a/docs/xqdoc/mermaid.html b/docs/xqdoc/mermaid.html deleted file mode 100644 index 115ad7f..0000000 --- a/docs/xqdoc/mermaid.html +++ /dev/null @@ -1,35 +0,0 @@ -Module imports diagram - xqDocA
--- -title: something here -config: - theme: base - securityLevel: loose ---- -classDiagram -direction TB - -class RESTXQ:::cssrest -class INVOKE:::cssmain -class TEST:::cssunit - -class pdfbox { << Pdfbox3.xqm >>} - - - -classDef cssrest fill:palegreen -classDef cssmain fill:powderblue -classDef cssunit fill:yellow - -link pdfbox "modules/F000001/index.html" "This is a tooltip for org.expkg_zone58.Pdfbox3" - - -
\ No newline at end of file diff --git a/docs/xqdoc/mermaid.mmd b/docs/xqdoc/mermaid.mmd deleted file mode 100644 index 05c5513..0000000 --- a/docs/xqdoc/mermaid.mmd +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: something here -config: - theme: base - securityLevel: loose ---- -classDiagram -direction TB - -class RESTXQ:::cssrest -class INVOKE:::cssmain -class TEST:::cssunit - -class pdfbox { << Pdfbox3.xqm >>} - - - -classDef cssrest fill:palegreen -classDef cssmain fill:powderblue -classDef cssunit fill:yellow - -link pdfbox "modules/F000001/index.html" "This is a tooltip for org.expkg_zone58.Pdfbox3" - - diff --git a/docs/xqdoc/modules/F000001/index.html b/docs/xqdoc/modules/F000001/index.html deleted file mode 100644 index 7331d1c..0000000 --- a/docs/xqdoc/modules/F000001/index.html +++ /dev/null @@ -1,842 +0,0 @@ -src - xqDocA - xqDocA

org.expkg_zone58.Pdfbox3  - library module
P

Summary

- -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -
See also
Authors
  • Andy Bunce 2025
Custom
Related documents
ViewDescriptionFormat
xqdocxqDoc xml file from the source modulexml
xqparsexqparse xml file from the source modulexml

Imports

- This module is imported by - 0 modules. It imports - 0 modules. -

Variables

3.1 $pdfbox:property-map

Summary
-property access map -keys are property names, -values are sequences of functions to get property from $pdf object -
Type
References 14 functions from 3 modules
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getAuthor#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreationDate#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreator#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getKeywords#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getModificationDate#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getProducer#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getSubject#1
  • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getTitle#1
  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentInformation#1
  • pdfbox:gregToISO#1
  • pdfbox:hasLabels#1
  • pdfbox:hasOutline#1
  • pdfbox:number-of-pages#1
  • pdfbox:specification#1
Annotations (1)
%private()
Source ( 35 lines)
variable $pdfbox:property-map:=map{
-  "pageCount": pdfbox:number-of-pages#1,
-
-  "hasOutline": pdfbox:hasOutline#1,
-
-  "hasLabels": pdfbox:hasLabels#1,
-
-  "specification":pdfbox:specification#1,
-
-  "title": (PDDocument:getDocumentInformation#1,
-            PDDocumentInformation:getTitle#1) ,
-
-  "author": (PDDocument:getDocumentInformation#1,
-             PDDocumentInformation:getAuthor#1 ),
-
-  "creator": (PDDocument:getDocumentInformation#1,
-              PDDocumentInformation:getCreator#1),
-
-  "producer": (PDDocument:getDocumentInformation#1,
-               PDDocumentInformation:getProducer#1),
-
-  "subject": (PDDocument:getDocumentInformation#1,
-              PDDocumentInformation:getSubject#1),
-
-  "keywords": (PDDocument:getDocumentInformation#1,
-               PDDocumentInformation:getKeywords#1),
-
-  "creationDate": (PDDocument:getDocumentInformation#1,
-                   PDDocumentInformation:getCreationDate#1,
-                   pdfbox:gregToISO#1),
-
-  "modificationDate":  (PDDocument:getDocumentInformation#1,
-                        PDDocumentInformation:getModificationDate#1,
-                        pdfbox:gregToISO#1)
-}

Functions

4.1 pdfbox:binary

Arities: #1

Summary
-Create binary representation of $pdf object as xs:base64Binary
Signatures
pdfbox:binary - ( - $pdf as item() ) as xs:base64Binary
Parameters
  • pdf as item()
Return
  • xs:base64Binary
Referenced by 1 functions from 1 modules
References 3 functions from 2 modules
  • {java:java.io.ByteArrayOutputStream}new#0
  • {java:java.io.ByteArrayOutputStream}toByteArray#1
  • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
Source ( 7 lines)
function pdfbox:binary($pdf as item())
-as xs:base64Binary{
-   let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
-   let $_:=PDDocument:save($pdf, $bytes)
-   return  Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
-         =>convert:integers-to-base64()
-}

4.2 pdfbox:bookmark

Arities: #2P

Summary
-return bookmark info for $bookmark -
Signatures
pdfbox:bookmark - ( - $bookmark as item(), $pdf as item() ) as map(*)
Parameters
  • bookmark as item()
  • pdf as item()
Return
  • map(*) map{index:..,title:..,hasChildren:..}
Referenced by 1 functions from 1 modules
References 3 functions from 1 modules
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}findDestinationPage#2
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getTitle#1
  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}hasChildren#1
Annotations (1)
%private()
Source ( 10 lines)
function pdfbox:bookmark($bookmark as item(),$pdf as item())
-as map(*)
-{
- map{ 
-  "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
-  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
-  (:=>translate("�",""), :),
-  "hasChildren": PDOutlineItem:hasChildren($bookmark)
-  }
-}

4.3 pdfbox:bookmark-xml

Arities: #1P

Summary
-recursive outline map to XML
Signatures
pdfbox:bookmark-xml - ( - $outline as map(*)* ) as element(bookmark)*
Parameters
  • outline as map(*)*
Return
  • element(bookmark) *
Referenced by 2 functions from 1 modules
References 1 functions from 1 modules
Annotations (1)
%private()
Source ( 8 lines)
function pdfbox:bookmark-xml($outline as map(*)*)
-as element(bookmark)*
-{
-  $outline!
-  <bookmark title="{?title}" index="{?index}">
-    {?children!pdfbox:bookmark-xml(.)}
-  </bookmark>
-}

4.4 pdfbox:close

Arities: #1

Summary
-Release any resources related to $pdf
Signatures
pdfbox:close - ( - $pdf as item() ) as empty-sequence
Parameters
  • pdf as item()
Return
  • empty-sequence
Referenced by 3 functions from 1 modules
References 1 functions from 1 modules
  • {java:org.apache.pdfbox.pdmodel.PDDocument}close#1
Source ( 6 lines)
function pdfbox:close($pdf as item())
-as empty-sequence(){
-  (# db:wrapjava void #) {
-     PDDocument:close($pdf)
-  }
-}

4.5 pdfbox:do-until

Arities: #3P

Summary
-fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until, note: $pos always zero -
Signatures
pdfbox:do-until - ( - $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()*
Parameters
  • input as item()*
  • action as function(item()*, xs:integer) as item()*
  • predicate as function(item()*, xs:integer) as xs:boolean?
Return
  • item() *
Referenced by 2 functions from 1 modules
References 5 functions from 2 modules
  • {http://www.w3.org/2001/XMLSchema}QName#1
  • {http://www.w3.org/2005/xpath-functions}QName#2
  • {http://www.w3.org/2005/xpath-functions}error#2
  • {http://www.w3.org/2005/xpath-functions}exists#1
  • {http://www.w3.org/2005/xpath-functions}function-lookup#2
Annotations (1)
%private()
Source ( 15 lines)
function pdfbox:do-until(
- $input 	as item()*, 	
- $action 	as function(item()*, xs:integer) as item()*, 	
- $predicate 	as function(item()*, xs:integer) as xs:boolean? 	
-) as item()*
-{
-  let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
-  return if(exists($fn))
-         then $fn($input,$action,$predicate)
-         else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3)
-              return if(exists($hof))
-                      then $hof($predicate(?,0),$action(?,0),$input)
-                      else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
-
-}

4.6 pdfbox:extract-range

Arities: #3

Summary
-Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -
Signatures
pdfbox:extract-range - ( - $pdf as item(), $start as xs:integer, $end as xs:integer ) as xs:base64Binary
Parameters
  • pdf as item()
  • start as xs:integer first page to include
  • end as xs:integer last page to include
Return
  • xs:base64Binary
Referenced by 0 functions from 0 modules
    References 3 functions from 2 modules
    Source ( 7 lines)
    function pdfbox:extract-range($pdf as item(), 
    -             $start as xs:integer,$end as xs:integer)
    -as xs:base64Binary
    -{
    -    let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
    -    return (pdfbox:binary($a),pdfbox:close($a)) 
    -}

    4.7 pdfbox:find-page

    Arities: #2

    Summary
    -pageIndex of $page in $pdf
    Signatures
    pdfbox:find-page - ( - $page as item()?, $pdf as item() ) as item()?
    Parameters
    • page as item()?
    • pdf as item()
    Return
    • item() ?
    Referenced by 0 functions from 0 modules
      References 2 functions from 2 modules
      • {http://www.w3.org/2005/xpath-functions}exists#1
      • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
      Source ( 10 lines)
      function pdfbox:find-page(
      -   $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
      -   $pdf as item())
      -as item()?
      -{
      -  if(exists($page))
      -  then PDDocument:getDocumentCatalog($pdf)
      -      =>PDDocumentCatalog:getPages()
      -      =>PDPageTree:indexOf($page)
      -}

      4.8 pdfbox:gregToISO

      Arities: #1P

      Summary
      -convert date
      Signatures
      pdfbox:gregToISO - ( - $item as item()? ) as xs:string?
      Parameters
      • item as item()?
      Return
      • xs:string ?
      Referenced by 0 functions from 0 modules
        References 2 functions from 2 modules
        • {http://www.w3.org/2005/xpath-functions}exists#1
        • {java:java.util.GregorianCalendar}toZonedDateTime#1
        Annotations (1)
        %private()
        Source ( 6 lines)
        function pdfbox:gregToISO($item as item()?)
        -as xs:string?{
        - if(exists($item))
        - then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
        - else ()
        -}

        4.9 pdfbox:hasLabels

        Arities: #1

        Summary
        -true if $pdf has Labels
        Signatures
        pdfbox:hasLabels - ( - $pdf as item() ) as xs:boolean
        Parameters
        • pdf as item()
        Return
        • xs:boolean
        Referenced by 0 functions from 0 modules
          References 1 functions from 1 modules
          • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
          Source ( 6 lines)
          function pdfbox:hasLabels($pdf as item())
          -as xs:boolean{
          -  PDDocument:getDocumentCatalog($pdf)
          -  =>PDDocumentCatalog:getPageLabels()
          -  =>exists()
          -}

          4.10 pdfbox:hasOutline

          Arities: #1

          Summary
          -true if $pdf has an outline
          Signatures
          pdfbox:hasOutline - ( - $pdf as item() ) as xs:boolean
          Parameters
          • pdf as item()
          Return
          • xs:boolean
          Referenced by 0 functions from 0 modules
            References 1 functions from 1 modules
            • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
            Source ( 6 lines)
            function pdfbox:hasOutline($pdf as item())
            -as xs:boolean{
            -  PDDocument:getDocumentCatalog($pdf)
            -  =>PDDocumentCatalog:getDocumentOutline()
            -  =>exists()
            -}

            4.11 pdfbox:labels

            Arities: #1

            Summary
            -pageLabel for every page or empty if none -
            Signatures
            pdfbox:labels - ( - $pdf as item() ) as xs:string*
            Parameters
            • pdf as item()
            Return
            • xs:string *
            Tags
            Referenced by 0 functions from 0 modules
              References 3 functions from 3 modules
              • {http://www.w3.org/2005/xpath-functions}exists#1
              • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
              • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getLabelsByPageIndices#1
              Source ( 9 lines)
              function pdfbox:labels($pdf as item())
              -as xs:string*
              -{
              -  let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
              -                   =>PDDocumentCatalog:getPageLabels()
              -  return if(exists($pagelabels))
              -         then PDPageLabels:getLabelsByPageIndices($pagelabels)
              -         else ()
              -}

              4.12 pdfbox:metadata

              Arities: #1

              Summary
              -XMP metadata as "RDF" document -
              Signatures
              pdfbox:metadata - ( - $pdf as item() ) as document-node(element(*))?
              Parameters
              • pdf as item()
              Return
              • document-node(element(*)) ?
              Tags
              • @note: - usually rdf:RDF root, but sometimes x:xmpmeta
              Referenced by 0 functions from 0 modules
                References 5 functions from 4 modules
                • {http://www.w3.org/2005/xpath-functions}exists#1
                • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                • {java:org.apache.pdfbox.pdmodel.common.PDMetadata}exportXMPMetadata#1
                • pdfbox:do-until#3
                • pdfbox:read-stream#2
                Source ( 17 lines)
                function pdfbox:metadata($pdf as item())
                -as document-node(element(*))?
                -{
                -  let $m:=PDDocument:getDocumentCatalog($pdf)
                -         =>PDDocumentCatalog:getMetadata()
                -  return  if(exists($m))
                -          then 
                -              let $is:=PDMetadata:exportXMPMetadata($m)
                -              return pdfbox:do-until(
                -                        map{"n":0,"data":""},
                -
                -                        function($input,$pos ) {  pdfbox:read-stream($is,$input?data)},
                -
                -                        function($output,$pos) { $output?n eq -1 }     
                -                     )?data=>parse-xml()
                -          else ()
                -}

                4.13 pdfbox:number-of-pages

                Arities: #1

                Summary
                -Number of pages in PDF
                Signatures
                pdfbox:number-of-pages - ( - $pdf as item() ) as xs:integer
                Parameters
                • pdf as item()
                Return
                • xs:integer
                Referenced by 0 functions from 0 modules
                  References 1 functions from 1 modules
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getNumberOfPages#1
                  Source ( 4 lines)
                  function pdfbox:number-of-pages($pdf as item())
                  -as xs:integer{
                  -  PDDocument:getNumberOfPages($pdf)
                  -}

                  4.14 pdfbox:open

                  Arities: #1#2

                  Summary
                  -open pdf using fetch:binary, returns pdf object
                  Signatures
                  pdfbox:open - ( - $pdfsrc as item() ) as item()
                  pdfbox:open - ( - $pdfsrc as item(), $opts as map(*) ) as item()
                  Parameters
                  • pdfsrc as item() a fetchable url or filepath, or xs:base64Binary item
                  • opts as map(*) options optionally with map {"password":}
                  Return
                  • item()
                  Referenced by 3 functions from 1 modules
                  References 8 functions from 6 modules
                  • {http://basex.org/modules/fetch}binary#1
                  • {http://www.w3.org/2001/XMLSchema}QName#1
                  • {http://www.w3.org/2005/xpath-functions}error#2
                  • {http://www.w3.org/2005/xpath-functions}starts-with#2
                  • {http://www.w3.org/2005/xpath-functions}string#1
                  • {java:org.apache.pdfbox.Loader}loadPDF#2
                  • {java:org.apache.pdfbox.io.RandomAccessReadBufferedFile}new#1
                  • pdfbox:open#2
                  Source ( 21 lines)
                  function pdfbox:open($pdfsrc as item())
                  -as item(){
                  -pdfbox:open($pdfsrc, map{})
                  -}
                  function pdfbox:open($pdfsrc as item(), $opts as map(*))
                  -as item(){
                  -  try{
                  -
                  -      if($pdfsrc instance of xs:base64Binary)
                  -      then Loader:loadPDF( $pdfsrc,string($opts?password))
                  -      else if(starts-with($pdfsrc,"http"))
                  -           then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password))
                  -           else  Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password))
                  -
                  -} catch *{
                  -    let $loc:=if($pdfsrc instance of xs:base64Binary)
                  -              then "xs:base64Binary"
                  -              else $pdfsrc
                  -    return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
                  -}
                  -}

                  4.15 pdfbox:outline

                  Arities: #1#2

                  Summary
                  -outline for $pdf as map()*
                  Signatures
                  pdfbox:outline - ( - $pdf as item() ) as map(*)*
                  pdfbox:outline - ( - $pdf as item(), $outlineItem as item()? ) as map(*)*
                  Parameters
                  • pdf as item()
                  • outlineItem as item()?
                  Return
                  • map(*) *
                  Referenced by 3 functions from 1 modules
                  References 6 functions from 5 modules
                  • {http://www.w3.org/2005/xpath-functions/map}get#2
                  • {http://www.w3.org/2005/xpath-functions}exists#1
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                  • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                  • pdfbox:outline#2
                  • pdfbox:outline_#2
                  Source ( 16 lines)
                  function pdfbox:outline($pdf as item())
                  -as map(*)*{
                  -  (# db:wrapjava some #) {
                  -  let $outline:=
                  -                PDDocument:getDocumentCatalog($pdf)
                  -                =>PDDocumentCatalog:getDocumentOutline()
                  - 
                  -  return  if(exists($outline))
                  -          then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) 
                  -  }
                  -}
                  function pdfbox:outline($pdf as item(),$outlineItem as item()?)
                  -as map(*)*{
                  -  let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
                  -  return map:get($find,"list")
                  -}

                  4.16 pdfbox:outline-xml

                  Arities: #1

                  Summary
                  -PDF outline in xml format
                  Signatures
                  pdfbox:outline-xml - ( - $pdf as item() ) as element(outline)?
                  Parameters
                  • pdf as item()
                  Return
                  • element(outline) ?
                  Referenced by 0 functions from 0 modules
                    References 3 functions from 2 modules
                    Source ( 7 lines)
                    function pdfbox:outline-xml($pdf as item())
                    -as element(outline)?{
                    - let $outline:=pdfbox:outline($pdf)
                    -  return if(exists($outline))
                    -         then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
                    -         else ()
                    -}

                    4.17 pdfbox:outline_

                    Arities: #2P

                    Summary
                    -BaseX bug 10.7? error if inlined in outline
                    Signatures
                    pdfbox:outline_ - ( - $pdf as item(), $outlineItem as item()? ) as map(*)
                    Parameters
                    • pdf as item()
                    • outlineItem as item()?
                    Return
                    • map(*)
                    Referenced by 1 functions from 1 modules
                    References 8 functions from 4 modules
                    • {http://www.w3.org/2005/xpath-functions/map}entry#2
                    • {http://www.w3.org/2005/xpath-functions/map}merge#1
                    • {http://www.w3.org/2005/xpath-functions}empty#1
                    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getNextSibling#1
                    • pdfbox:bookmark#2
                    • pdfbox:do-until#3
                    • pdfbox:outline#2
                    Annotations (1)
                    %private()
                    Source ( 20 lines)
                    function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                    -as map(*){
                    -  pdfbox:do-until(
                    -    
                    -     map{"list":(),"this":$outlineItem},
                    -
                    -     function($input,$pos ) { 
                    -        let $bk:= pdfbox:bookmark($input?this,$pdf)
                    -        let $bk:= if($bk?hasChildren)
                    -                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                    -                        return map:merge(($bk,map:entry("children",$kids)))
                    -                  else $bk 
                    -        return map{
                    -              "list": ($input?list, $bk),
                    -              "this":  PDOutlineItem:getNextSibling($input?this)}
                    -      },
                    -
                    -     function($output,$pos) { empty($output?this) }                      
                    -  )
                    -}

                    4.18 pdfbox:page-media-box

                    Arities: #2

                    Summary
                    -return size of $pageNo (zero based) -
                    Signatures
                    pdfbox:page-media-box - ( - $pdf as item(), $pageNo as xs:integer ) as xs:string
                    Parameters
                    • pdf as item()
                    • pageNo as xs:integer
                    Return
                    • xs:string
                    Tags
                    • @result: - e.g. [0.0,0.0,168.0,239.52]
                    Referenced by 0 functions from 0 modules
                      References 1 functions from 1 modules
                      • {java:org.apache.pdfbox.pdmodel.PDDocument}getPage#2
                      Source ( 6 lines)
                      function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
                      -as xs:string{
                      -  PDDocument:getPage($pdf, $pageNo)
                      -  =>PDPage:getMediaBox()
                      -  =>PDRectangle:toString()
                      -}

                      4.19 pdfbox:page-render

                      Arities: #3

                      Summary
                      -Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??
                      Signatures
                      pdfbox:page-render - ( - $pdf as item(), $pageNo as xs:integer, $options as map(*) ) as xs:base64Binary
                      Parameters
                      • pdf as item()
                      • pageNo as xs:integer
                      • options as map(*)
                      Return
                      • xs:base64Binary
                      Referenced by 0 functions from 0 modules
                        References 5 functions from 4 modules
                        • {http://www.w3.org/2005/xpath-functions/map}merge#1
                        • {java:java.io.ByteArrayOutputStream}new#0
                        • {java:java.io.ByteArrayOutputStream}toByteArray#1
                        • {java:javax.imageio.ImageIO}write#3
                        • {java:org.apache.pdfbox.rendering.PDFRenderer}new#1
                        Source ( 10 lines)
                        function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                        -as xs:base64Binary{
                        -  let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
                        -  let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
                        -  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                        -  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                        -  return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                        -         =>convert:integers-to-base64()
                        - 
                        -}

                        4.20 pdfbox:page-text

                        Arities: #2

                        Summary
                        -return text on $pageNo
                        Signatures
                        pdfbox:page-text - ( - $pdf as item(), $pageNo as xs:integer ) as xs:string
                        Parameters
                        • pdf as item()
                        • pageNo as xs:integer
                        Return
                        • xs:string
                        Referenced by 0 functions from 0 modules
                          References 2 functions from 1 modules
                          • {java:org.apache.pdfbox.text.PDFTextStripper}getText#2
                          • {java:org.apache.pdfbox.text.PDFTextStripper}new#0
                          Source ( 9 lines)
                          function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                          -as xs:string{
                          -  let $tStripper := (# db:wrapjava instance #) {
                          -         PDFTextStripper:new()
                          -         => PDFTextStripper:setStartPage($pageNo)
                          -         => PDFTextStripper:setEndPage($pageNo)
                          -       }
                          -  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                          -}

                          4.21 pdfbox:property

                          Arities: #2

                          Summary
                          -return value of $property for $pdf
                          Signatures
                          pdfbox:property - ( - $pdf as item(), $property as xs:string ) as item()*
                          Parameters
                          • pdf as item()
                          • property as xs:string
                          Return
                          • item() *
                          Referenced by 1 functions from 1 modules
                          References 5 functions from 2 modules
                          • {http://www.w3.org/2001/XMLSchema}QName#1
                          • {http://www.w3.org/2005/xpath-functions}concat#3
                          • {http://www.w3.org/2005/xpath-functions}error#2
                          • {http://www.w3.org/2005/xpath-functions}exists#1
                          • {http://www.w3.org/2005/xpath-functions}fold-left#3
                          Source ( 9 lines)
                          function pdfbox:property($pdf as item(),$property as xs:string)
                          -as item()*{
                          -  let $fns:= $pdfbox:property-map($property)
                          -  return if(exists($fns))
                          -         then fold-left($fns, 
                          -                        $pdf, 
                          -                        function($result,$this as function(*)){$this($result)})
                          -         else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                          -}

                          4.22 pdfbox:property-names

                          Arities: #0

                          Summary
                          -known property names sorted
                          Signatures
                          pdfbox:property-names - ( - ) as xs:string*
                          Return
                          • xs:string *
                          Referenced by 0 functions from 0 modules
                            Source ( 4 lines)
                            function pdfbox:property-names() 
                            -as xs:string*{
                            -  $pdfbox:property-map=>map:keys()=>sort()
                            -}

                            4.23 pdfbox:read-stream

                            Arities: #2P

                            Summary
                            -read next block from XMP stream
                            Signatures
                            pdfbox:read-stream - ( - $is, $read as xs:string ) as map(*)
                            Parameters
                            • is as 
                            • read as xs:string
                            Return
                            • map(*)
                            Referenced by 1 functions from 1 modules
                            References 6 functions from 5 modules
                            • {http://basex.org/modules/convert}integers-to-base64#1
                            • {http://www.w3.org/2001/XMLSchema}byte#1
                            • {http://www.w3.org/2001/XMLSchema}int#1
                            • {http://www.w3.org/2005/xpath-functions}subsequence#3
                            • {java:java.util.Arrays}copyOf#2
                            • {java:org.apache.pdfbox.cos.COSInputStream}read#4
                            Annotations (1)
                            %private()
                            Source ( 8 lines)
                            function pdfbox:read-stream($is,$read as xs:string)
                            -as map(*){
                            -  let $blen:=4096
                            -  let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen)
                            -  let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
                            -  let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
                            -  return map{"n":$n, "data": $read || $data}
                            -}

                            4.24 pdfbox:report

                            Arities: #1#2

                            Summary
                            -summary CSV style info for all properties for $pdfpaths -
                            Signatures
                            pdfbox:report - ( - $pdfpaths as xs:string* ) as map(*)
                            pdfbox:report - ( - $pdfpaths as item()*, $properties as xs:string* ) as map(*)
                            Parameters
                            • pdfpaths as item()*
                            • properties as xs:string*
                            Return
                            • map(*)
                            Tags
                            Referenced by 1 functions from 1 modules
                            References 8 functions from 4 modules
                            Source ( 28 lines)
                            function pdfbox:report($pdfpaths as xs:string*)
                            -as map(*){
                            - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
                            -}
                            function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                            -as map(*){
                            -  map{"names":   array{"path",$properties},
                            -  
                            -      "records": for $path in $pdfpaths
                            -                 let $name:=if($path instance of xs:base64Binary) then "binary" else $path
                            -                 return try{
                            -                  let $pdf:=pdfbox:open($path)
                            -                  return (fold-left($properties,
                            -                                  array{$name},
                            -                                  function($result as array(*),$prop as xs:string){
                            -                                    array:append($result, string(pdfbox:property($pdf, $prop)))}
                            -                         ), pdfbox:close($pdf)
                            -                         )
                            -                 } catch *{
                            -                      fold-left($properties,
                            -                                array{$name},
                            -                                function($result as array(*),$prop as xs:string){
                            -                                    array:append($result, "#ERROR")}
                            -                               )
                            -                 }
                            -               
                            -  }
                            -}

                            4.25 pdfbox:save

                            Arities: #2

                            Summary
                            -Save pdf $pdf to filesystem at $savepath , returns $savepath
                            Signatures
                            pdfbox:save - ( - $pdf as item(), $savepath as xs:string ) as xs:string
                            Parameters
                            • pdf as item()
                            • savepath as xs:string
                            Return
                            • xs:string
                            Referenced by 0 functions from 0 modules
                              References 2 functions from 2 modules
                              • {java:java.io.File}new#1
                              • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
                              Source ( 4 lines)
                              function pdfbox:save($pdf as item(),$savepath as xs:string)
                              -as xs:string{
                              -   PDDocument:save($pdf, File:new($savepath)),$savepath
                              -}

                              4.26 pdfbox:specification

                              Arities: #1

                              Summary
                              -The version of the PDF specification used by $pdf e.g "1.4" -returned as string to avoid float rounding issues -
                              Signatures
                              pdfbox:specification - ( - $pdf as item() ) as xs:string
                              Parameters
                              • pdf as item()
                              Return
                              • xs:string
                              Referenced by 0 functions from 0 modules
                                References 1 functions from 1 modules
                                • {java:org.apache.pdfbox.pdmodel.PDDocument}getVersion#1
                                Source ( 4 lines)
                                function pdfbox:specification($pdf as item())
                                -as xs:string{
                                - PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
                                -}

                                4.27 pdfbox:version

                                Arities: #0

                                Summary
                                -version of Apache Pdfbox in use e.g. "3.0.4"
                                Signatures
                                pdfbox:version - ( - ) as xs:string
                                Return
                                • xs:string
                                Referenced by 0 functions from 0 modules
                                  References 1 functions from 1 modules
                                  • {java:org.apache.pdfbox.util.Version}getVersion#0
                                  Source ( 4 lines)
                                  function pdfbox:version()
                                  -as xs:string{
                                  -  Q{java:org.apache.pdfbox.util.Version}getVersion()
                                  -}

                                  4.28 pdfbox:with-pdf

                                  Arities: #2

                                  Summary
                                  -with-document pattern: open pdf,apply function, close pdf -creates a local pdfobject and ensures it is closed after use -e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) -
                                  Signatures
                                  pdfbox:with-pdf - ( - $src as xs:string, $fn as function(item())as item()* ) as item()*
                                  Parameters
                                  • src as xs:string
                                  • fn as function(item())as item()*
                                  Return
                                  • item() *
                                  Referenced by 0 functions from 0 modules
                                    References 3 functions from 2 modules
                                    Source ( 11 lines)
                                    function pdfbox:with-pdf($src as xs:string,
                                    -                                $fn as function(item())as item()*)
                                    -as item()*{
                                    - let $pdf:=pdfbox:open($src)
                                    - return try{
                                    -        $fn($pdf),pdfbox:close($pdf)
                                    -        } catch *{
                                    -          pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
                                    -        }
                                    -
                                    -}

                                    Namespaces

                                    The following namespaces are defined:

                                    Prefix -Uri -
                                    arrayhttp://www.w3.org/2005/xpath-functions/array
                                    converthttp://basex.org/modules/convert
                                    COSInputStreamjava:org.apache.pdfbox.cos.COSInputStream
                                    dbhttp://basex.org/modules/db
                                    errhttp://www.w3.org/2005/xqt-errors
                                    fetchhttp://basex.org/modules/fetch
                                    Filejava:java.io.File
                                    fnhttp://www.w3.org/2005/xpath-functions
                                    Loaderjava:org.apache.pdfbox.Loader
                                    maphttp://www.w3.org/2005/xpath-functions/map
                                    PageExtractorjava:org.apache.pdfbox.multipdf.PageExtractor
                                    PDDocumentjava:org.apache.pdfbox.pdmodel.PDDocument
                                    PDDocumentCatalogjava:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                                    PDDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformation
                                    PDDocumentOutlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                                    pdfboxorg.expkg_zone58.Pdfbox3
                                    PDFRendererjava:org.apache.pdfbox.rendering.PDFRenderer
                                    PDFTextStripperjava:org.apache.pdfbox.text.PDFTextStripper
                                    PDMetadatajava:org.apache.pdfbox.pdmodel.common.PDMetadata
                                    PDOutlineItemjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                                    PDPagejava:org.apache.pdfbox.pdmodel.PDPage
                                    PDPageLabelsjava:org.apache.pdfbox.pdmodel.common.PDPageLabels
                                    PDPageTreejava:org.apache.pdfbox.pdmodel.PDPageTree
                                    PDRectangleorg.apache.pdfbox.pdmodel.common.PDRectangle
                                    RandomAccessReadBufferjava:org.apache.pdfbox.io.RandomAccessReadBuffer
                                    RandomAccessReadBufferedFilejava:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                                    rdfhttp://www.w3.org/1999/02/22-rdf-syntax-ns#
                                    xshttp://www.w3.org/2001/XMLSchema

                                    6 RestXQ

                                    None

                                    Source Code

                                    xquery version '3.1';
                                    -(:~ 
                                    -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , 
                                    -requires pdfbox jars on classpath, i.e. in custom or xar
                                    -tested with pdfbox-app-3.0.5.jar
                                    -@see https://pdfbox.apache.org/download.cgi
                                    -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
                                    -@author Andy Bunce 2025
                                    -:)
                                    -
                                    -module namespace pdfbox="org.expkg_zone58.Pdfbox3";
                                    -
                                    -declare namespace Loader ="java:org.apache.pdfbox.Loader"; 
                                    -declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
                                    -declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
                                    -declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
                                    -declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
                                    -declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
                                    -declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage";
                                    -declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
                                    -declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
                                    -declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
                                    -declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
                                    -declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
                                    -declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata";
                                    -declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream";
                                    -
                                    -declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
                                    -
                                    -
                                    -declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
                                    -declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
                                    -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
                                    -
                                    -declare namespace File ="java:java.io.File";
                                    -
                                    -
                                    -
                                    -(:~ with-document pattern: open pdf,apply function, close pdf
                                    - creates a local pdfobject and ensures it is closed after use
                                    -e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
                                    -:)
                                    -declare function pdfbox:with-pdf($src as xs:string,
                                    -                                $fn as function(item())as item()*)
                                    -as item()*{
                                    - let $pdf:=pdfbox:open($src)
                                    - return try{
                                    -        $fn($pdf),pdfbox:close($pdf)
                                    -        } catch *{
                                    -          pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
                                    -        }
                                    -
                                    -};
                                    -
                                    -
                                    -(:~ open pdf using fetch:binary, returns pdf object :)
                                    -declare function pdfbox:open($pdfsrc as item())
                                    -as item(){
                                    -pdfbox:open($pdfsrc, map{})
                                    -};
                                    -
                                    -(:~ open pdf from file/url/binary, opts may have password , returns pdf object 
                                    -@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item
                                    -@param $opts options otionally with map {"password":} 
                                    -:)
                                    -declare function pdfbox:open($pdfsrc as item(), $opts as map(*))
                                    -as item(){
                                    -  try{
                                    -
                                    -      if($pdfsrc instance of xs:base64Binary)
                                    -      then Loader:loadPDF( $pdfsrc,string($opts?password))
                                    -      else if(starts-with($pdfsrc,"http"))
                                    -           then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password))
                                    -           else  Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password))
                                    -
                                    -} catch *{
                                    -    let $loc:=if($pdfsrc instance of xs:base64Binary)
                                    -              then "xs:base64Binary"
                                    -              else $pdfsrc
                                    -    return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
                                    -}
                                    -};
                                    -
                                    -(:~ The version of the PDF specification used by $pdf  e.g "1.4"
                                    -returned as string to avoid float rounding issues
                                    - :)
                                    -declare function pdfbox:specification($pdf as item())
                                    -as xs:string{
                                    - PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
                                    -};
                                    -
                                    -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :)
                                    -declare function pdfbox:save($pdf as item(),$savepath as xs:string)
                                    -as xs:string{
                                    -   PDDocument:save($pdf, File:new($savepath)),$savepath
                                    -};
                                    -
                                    -(:~ Create binary representation of $pdf object as xs:base64Binary :)
                                    -declare function pdfbox:binary($pdf as item())
                                    -as xs:base64Binary{
                                    -   let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                                    -   let $_:=PDDocument:save($pdf, $bytes)
                                    -   return  Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                                    -         =>convert:integers-to-base64()
                                    -};
                                    -
                                    -(:~ Release any resources related to $pdf:)
                                    -declare function pdfbox:close($pdf as item())
                                    -as empty-sequence(){
                                    -  (# db:wrapjava void #) {
                                    -     PDDocument:close($pdf)
                                    -  }
                                    -};
                                    -
                                    -(:~ Number of pages in PDF:)
                                    -declare function pdfbox:number-of-pages($pdf as item())
                                    -as xs:integer{
                                    -  PDDocument:getNumberOfPages($pdf)
                                    -};
                                    -
                                    -(:~ Pdf page as image (zero is cover)
                                    -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :)
                                    -declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                                    -as xs:base64Binary{
                                    -  let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
                                    -  let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
                                    -  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                                    -  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                                    -  return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                                    -         =>convert:integers-to-base64()
                                    - 
                                    -};
                                    -
                                    -
                                    -(:~ property access map
                                    -   keys are property names, 
                                    -   values are sequences of functions to get property from $pdf object
                                    -:)
                                    -declare %private variable $pdfbox:property-map:=map{
                                    -  "pageCount": pdfbox:number-of-pages#1,
                                    -
                                    -  "hasOutline": pdfbox:hasOutline#1,
                                    -
                                    -  "hasLabels": pdfbox:hasLabels#1,
                                    -
                                    -  "specification":pdfbox:specification#1,
                                    -
                                    -  "title": (PDDocument:getDocumentInformation#1,
                                    -            PDDocumentInformation:getTitle#1) ,
                                    -
                                    -  "author": (PDDocument:getDocumentInformation#1,
                                    -             PDDocumentInformation:getAuthor#1 ),
                                    -
                                    -  "creator": (PDDocument:getDocumentInformation#1,
                                    -              PDDocumentInformation:getCreator#1),
                                    -
                                    -  "producer": (PDDocument:getDocumentInformation#1,
                                    -               PDDocumentInformation:getProducer#1),
                                    -
                                    -  "subject": (PDDocument:getDocumentInformation#1,
                                    -              PDDocumentInformation:getSubject#1),
                                    -
                                    -  "keywords": (PDDocument:getDocumentInformation#1,
                                    -               PDDocumentInformation:getKeywords#1),
                                    -
                                    -  "creationDate": (PDDocument:getDocumentInformation#1,
                                    -                   PDDocumentInformation:getCreationDate#1,
                                    -                   pdfbox:gregToISO#1),
                                    -
                                    -  "modificationDate":  (PDDocument:getDocumentInformation#1,
                                    -                        PDDocumentInformation:getModificationDate#1,
                                    -                        pdfbox:gregToISO#1)
                                    -};
                                    -
                                    -(:~ known property names sorted :)
                                    -declare function pdfbox:property-names() 
                                    -as xs:string*{
                                    -  $pdfbox:property-map=>map:keys()=>sort()
                                    -};
                                    -
                                    -(:~  return value of $property for $pdf :)
                                    -declare function pdfbox:property($pdf as item(),$property as xs:string)
                                    -as item()*{
                                    -  let $fns:= $pdfbox:property-map($property)
                                    -  return if(exists($fns))
                                    -         then fold-left($fns, 
                                    -                        $pdf, 
                                    -                        function($result,$this as function(*)){$this($result)})
                                    -         else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                                    -};
                                    -
                                    -(:~ summary CSV style info for all properties for $pdfpaths 
                                    -:)
                                    -declare function pdfbox:report($pdfpaths as xs:string*)
                                    -as map(*){
                                    - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
                                    -};
                                    -
                                    -(:~ summary CSV style info for named properties for $pdfpaths 
                                    -@see https://docs.basex.org/main/CSV_Functions#xquery
                                    -:)
                                    -declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                                    -as map(*){
                                    -  map{"names":   array{"path",$properties},
                                    -  
                                    -      "records": for $path in $pdfpaths
                                    -                 let $name:=if($path instance of xs:base64Binary) then "binary" else $path
                                    -                 return try{
                                    -                  let $pdf:=pdfbox:open($path)
                                    -                  return (fold-left($properties,
                                    -                                  array{$name},
                                    -                                  function($result as array(*),$prop as xs:string){
                                    -                                    array:append($result, string(pdfbox:property($pdf, $prop)))}
                                    -                         ), pdfbox:close($pdf)
                                    -                         )
                                    -                 } catch *{
                                    -                      fold-left($properties,
                                    -                                array{$name},
                                    -                                function($result as array(*),$prop as xs:string){
                                    -                                    array:append($result, "#ERROR")}
                                    -                               )
                                    -                 }
                                    -               
                                    -  }
                                    -};
                                    -
                                    -(:~ true if $pdf has an outline :)
                                    -declare function pdfbox:hasOutline($pdf as item())
                                    -as xs:boolean{
                                    -  PDDocument:getDocumentCatalog($pdf)
                                    -  =>PDDocumentCatalog:getDocumentOutline()
                                    -  =>exists()
                                    -};
                                    -
                                    -(:~ true if $pdf has Labels :)
                                    -declare function pdfbox:hasLabels($pdf as item())
                                    -as xs:boolean{
                                    -  PDDocument:getDocumentCatalog($pdf)
                                    -  =>PDDocumentCatalog:getPageLabels()
                                    -  =>exists()
                                    -};
                                    -
                                    -(:~ XMP metadata as "RDF" document
                                    -@note usually rdf:RDF root, but sometimes x:xmpmeta 
                                    -:)
                                    -declare function pdfbox:metadata($pdf as item())
                                    -as document-node(element(*))?
                                    -{
                                    -  let $m:=PDDocument:getDocumentCatalog($pdf)
                                    -         =>PDDocumentCatalog:getMetadata()
                                    -  return  if(exists($m))
                                    -          then 
                                    -              let $is:=PDMetadata:exportXMPMetadata($m)
                                    -              return pdfbox:do-until(
                                    -                        map{"n":0,"data":""},
                                    -
                                    -                        function($input,$pos ) {  pdfbox:read-stream($is,$input?data)},
                                    -
                                    -                        function($output,$pos) { $output?n eq -1 }     
                                    -                     )?data=>parse-xml()
                                    -          else ()
                                    -};
                                    -
                                    -(:~ read next block from XMP stream :)
                                    -declare %private function pdfbox:read-stream($is,$read as xs:string)
                                    -as map(*){
                                    -  let $blen:=4096
                                    -  let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen)
                                    -  let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
                                    -  let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
                                    -  return map{"n":$n, "data": $read || $data}
                                    -};
                                    -
                                    -(:~ outline for $pdf as map()* :)
                                    -declare function pdfbox:outline($pdf as item())
                                    -as map(*)*{
                                    -  (# db:wrapjava some #) {
                                    -  let $outline:=
                                    -                PDDocument:getDocumentCatalog($pdf)
                                    -                =>PDDocumentCatalog:getDocumentOutline()
                                    - 
                                    -  return  if(exists($outline))
                                    -          then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) 
                                    -  }
                                    -};
                                    -
                                    -(:~ return bookmark info for children of $outlineItem as seq of maps :)
                                    -declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
                                    -as map(*)*{
                                    -  let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
                                    -  return map:get($find,"list")
                                    -};
                                    -
                                    -(:~ BaseX bug 10.7? error if inlined in outline :)
                                    -declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                                    -as map(*){
                                    -  pdfbox:do-until(
                                    -    
                                    -     map{"list":(),"this":$outlineItem},
                                    -
                                    -     function($input,$pos ) { 
                                    -        let $bk:= pdfbox:bookmark($input?this,$pdf)
                                    -        let $bk:= if($bk?hasChildren)
                                    -                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                                    -                        return map:merge(($bk,map:entry("children",$kids)))
                                    -                  else $bk 
                                    -        return map{
                                    -              "list": ($input?list, $bk),
                                    -              "this":  PDOutlineItem:getNextSibling($input?this)}
                                    -      },
                                    -
                                    -     function($output,$pos) { empty($output?this) }                      
                                    -  )
                                    -};
                                    -
                                    -(:~ PDF outline in xml format :)
                                    -declare function pdfbox:outline-xml($pdf as item())
                                    -as element(outline)?{
                                    - let $outline:=pdfbox:outline($pdf)
                                    -  return if(exists($outline))
                                    -         then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
                                    -         else ()
                                    -};
                                    -
                                    -(:~ recursive ouutline map to XML :)
                                    -declare %private function pdfbox:bookmark-xml($outline as map(*)*)
                                    -as element(bookmark)*
                                    -{
                                    -  $outline!
                                    -  <bookmark title="{?title}" index="{?index}">
                                    -    {?children!pdfbox:bookmark-xml(.)}
                                    -  </bookmark>
                                    -};
                                    -
                                    -(:~ return bookmark info for $bookmark
                                    -@return map{index:..,title:..,hasChildren:..}
                                    -:)
                                    -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
                                    -as map(*)
                                    -{
                                    - map{ 
                                    -  "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
                                    -  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
                                    -  (:=>translate("�",""), :),
                                    -  "hasChildren": PDOutlineItem:hasChildren($bookmark)
                                    -  }
                                    -};
                                    -
                                    -
                                    -(:~ pageIndex of $page in $pdf :)
                                    -declare function pdfbox:find-page(
                                    -   $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
                                    -   $pdf as item())
                                    -as item()?
                                    -{
                                    -  if(exists($page))
                                    -  then PDDocument:getDocumentCatalog($pdf)
                                    -      =>PDDocumentCatalog:getPages()
                                    -      =>PDPageTree:indexOf($page)
                                    -};            
                                    -
                                    -(:~  Return new  PDF doc with pages from $start to $end as xs:base64Binary, (1 based)  
                                    -@param $start first page to include
                                    -@param $end last page to include
                                    -:)
                                    -declare function pdfbox:extract-range($pdf as item(), 
                                    -             $start as xs:integer,$end as xs:integer)
                                    -as xs:base64Binary
                                    -{
                                    -    let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract()
                                    -    return (pdfbox:binary($a),pdfbox:close($a)) 
                                    -};
                                    -
                                    -
                                    -(:~   pageLabel for every page or empty if none
                                    -@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
                                    -@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
                                    -:)
                                    -declare function pdfbox:labels($pdf as item())
                                    -as xs:string*
                                    -{
                                    -  let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
                                    -                   =>PDDocumentCatalog:getPageLabels()
                                    -  return if(exists($pagelabels))
                                    -         then PDPageLabels:getLabelsByPageIndices($pagelabels)
                                    -         else ()
                                    -};
                                    -
                                    -(:~ return text on $pageNo :)
                                    -declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                                    -as xs:string{
                                    -  let $tStripper := (# db:wrapjava instance #) {
                                    -         PDFTextStripper:new()
                                    -         => PDFTextStripper:setStartPage($pageNo)
                                    -         => PDFTextStripper:setEndPage($pageNo)
                                    -       }
                                    -  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                                    -};
                                    -
                                    -(:~ return size of $pageNo (zero based)
                                    -@result e.g. [0.0,0.0,168.0,239.52]
                                    - :)
                                    -declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
                                    -as xs:string{
                                    -  PDDocument:getPage($pdf, $pageNo)
                                    -  =>PDPage:getMediaBox()
                                    -  =>PDRectangle:toString()
                                    -};
                                    -
                                    -(:~  version of Apache Pdfbox in use  e.g. "3.0.4" :)
                                    -declare function pdfbox:version()
                                    -as xs:string{
                                    -  Q{java:org.apache.pdfbox.util.Version}getVersion()
                                    -};
                                    -
                                    -(:~ convert date :)
                                    -declare %private
                                    -function pdfbox:gregToISO($item as item()?)
                                    -as xs:string?{
                                    - if(exists($item))
                                    - then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
                                    - else ()
                                    -};
                                    -
                                    -(:~ fn:do-until shim for BaseX 9+10 
                                    -if  fn:do-until not found use hof:until, note: $pos always zero
                                    -:)
                                    -declare %private function pdfbox:do-until(
                                    - $input 	as item()*, 	
                                    - $action 	as function(item()*, xs:integer) as item()*, 	
                                    - $predicate 	as function(item()*, xs:integer) as xs:boolean? 	
                                    -) as item()*
                                    -{
                                    -  let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
                                    -  return if(exists($fn))
                                    -         then $fn($input,$action,$predicate)
                                    -         else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3)
                                    -              return if(exists($hof))
                                    -                      then $hof($predicate(?,0),$action(?,0),$input)
                                    -                      else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
                                    -
                                    -};
                                    -
                                    \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqdoc.xml b/docs/xqdoc/modules/F000001/xqdoc.xml deleted file mode 100644 index 8509c11..0000000 --- a/docs/xqdoc/modules/F000001/xqdoc.xml +++ /dev/null @@ -1,774 +0,0 @@ -2025-06-01T21:16:07.687+01:001.1org.expkg_zone58.Pdfbox3pdfbox - -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -Andy Bunce 2025https://pdfbox.apache.org/download.cgihttps://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/xquery version '3.1'; -(:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -@see https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ -@author Andy Bunce 2025 -:) - -module namespace pdfbox="org.expkg_zone58.Pdfbox3"; - -declare namespace Loader ="java:org.apache.pdfbox.Loader"; -declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; -declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; -declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; -declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; -declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; -declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; -declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; -declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; -declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; -declare namespace 
PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; -declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; -declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; - -declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; - - -declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; -declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; - -declare namespace File ="java:java.io.File"; - - - -(:~ with-document pattern: open pdf,apply function, close pdf - creates a local pdfobject and ensures it is closed after use -e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) -:) -declare function pdfbox:with-pdf($src as xs:string, - $fn as function(item())as item()*) -as item()*{ - let $pdf:=pdfbox:open($src) - return try{ - $fn($pdf),pdfbox:close($pdf) - } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) - } - -}; - - -(:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -}; - -(:~ open pdf from file/url/binary, opts may have password , returns pdf object -@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item -@param $opts options otionally with map {"password":} -:) -declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) -as item(){ - try{ - - if($pdfsrc instance of xs:base64Binary) - then Loader:loadPDF( $pdfsrc,string($opts?password)) - else if(starts-with($pdfsrc,"http")) - then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) - else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) - -} catch *{ - let $loc:=if($pdfsrc instance of xs:base64Binary) - then "xs:base64Binary" - else $pdfsrc - return error(xs:QName("pdfbox:open"),"Failed PDF load " 
|| $loc || " " || $err:description) -} -}; - -(:~ The version of the PDF specification used by $pdf e.g "1.4" -returned as string to avoid float rounding issues - :) -declare function pdfbox:specification($pdf as item()) -as xs:string{ - PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() -}; - -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) -declare function pdfbox:save($pdf as item(),$savepath as xs:string) -as xs:string{ - PDDocument:save($pdf, File:new($savepath)),$savepath -}; - -(:~ Create binary representation of $pdf object as xs:base64Binary :) -declare function pdfbox:binary($pdf as item()) -as xs:base64Binary{ - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=PDDocument:save($pdf, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() -}; - -(:~ Release any resources related to $pdf:) -declare function pdfbox:close($pdf as item()) -as empty-sequence(){ - (# db:wrapjava void #) { - PDDocument:close($pdf) - } -}; - -(:~ Number of pages in PDF:) -declare function pdfbox:number-of-pages($pdf as item()) -as xs:integer{ - PDDocument:getNumberOfPages($pdf) -}; - -(:~ Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) -declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) -as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() - -}; - - -(:~ property access map - keys are property names, - values are sequences of functions to get property from $pdf object -:) -declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, - - "hasOutline": pdfbox:hasOutline#1, - - "hasLabels": pdfbox:hasLabels#1, - - "specification":pdfbox:specification#1, - - "title": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getTitle#1) , - - "author": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getAuthor#1 ), - - "creator": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreator#1), - - "producer": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getProducer#1), - - "subject": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getSubject#1), - - "keywords": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getKeywords#1), - - "creationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreationDate#1, - pdfbox:gregToISO#1), - - "modificationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) -}; - -(:~ known property names sorted :) -declare function pdfbox:property-names() -as xs:string*{ - $pdfbox:property-map=>map:keys()=>sort() -}; - -(:~ return value of $property for $pdf :) -declare function pdfbox:property($pdf as item(),$property as xs:string) -as item()*{ - let $fns:= 
$pdfbox:property-map($property) - return if(exists($fns)) - then fold-left($fns, - $pdf, - function($result,$this as function(*)){$this($result)}) - else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) -}; - -(:~ summary CSV style info for all properties for $pdfpaths -:) -declare function pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) -}; - -(:~ summary CSV style info for named properties for $pdfpaths -@see https://docs.basex.org/main/CSV_Functions#xquery -:) -declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*) -as map(*){ - map{"names": array{"path",$properties}, - - "records": for $path in $pdfpaths - let $name:=if($path instance of xs:base64Binary) then "binary" else $path - return try{ - let $pdf:=pdfbox:open($path) - return (fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, string(pdfbox:property($pdf, $prop)))} - ), pdfbox:close($pdf) - ) - } catch *{ - fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, "#ERROR")} - ) - } - - } -}; - -(:~ true if $pdf has an outline :) -declare function pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() -}; - -(:~ true if $pdf has Labels :) -declare function pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() -}; - -(:~ XMP metadata as "RDF" document -@note usually rdf:RDF root, but sometimes x:xmpmeta -:) -declare function pdfbox:metadata($pdf as item()) -as document-node(element(*))? 
-{ - let $m:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getMetadata() - return if(exists($m)) - then - let $is:=PDMetadata:exportXMPMetadata($m) - return pdfbox:do-until( - map{"n":0,"data":""}, - - function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, - - function($output,$pos) { $output?n eq -1 } - )?data=>parse-xml() - else () -}; - -(:~ read next block from XMP stream :) -declare %private function pdfbox:read-stream($is,$read as xs:string) -as map(*){ - let $blen:=4096 - let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) - let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) - let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() - return map{"n":$n, "data": $read || $data} -}; - -(:~ outline for $pdf as map()* :) -declare function pdfbox:outline($pdf as item()) -as map(*)*{ - (# db:wrapjava some #) { - let $outline:= - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - - return if(exists($outline)) - then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) - } -}; - -(:~ return bookmark info for children of $outlineItem as seq of maps :) -declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) -as map(*)*{ - let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) - return map:get($find,"list") -}; - -(:~ BaseX bug 10.7? error if inlined in outline :) -declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
-as map(*){ - pdfbox:do-until( - - map{"list":(),"this":$outlineItem}, - - function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) - return map:merge(($bk,map:entry("children",$kids))) - else $bk - return map{ - "list": ($input?list, $bk), - "this": PDOutlineItem:getNextSibling($input?this)} - }, - - function($output,$pos) { empty($output?this) } - ) -}; - -(:~ PDF outline in xml format :) -declare function pdfbox:outline-xml($pdf as item()) -as element(outline)?{ - let $outline:=pdfbox:outline($pdf) - return if(exists($outline)) - then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> - else () -}; - -(:~ recursive ouutline map to XML :) -declare %private function pdfbox:bookmark-xml($outline as map(*)*) -as element(bookmark)* -{ - $outline! - <bookmark title="{?title}" index="{?index}"> - {?children!pdfbox:bookmark-xml(.)} - </bookmark> -}; - -(:~ return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} -:) -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -}; - - -(:~ pageIndex of $page in $pdf :) -declare function pdfbox:find-page( - $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), - $pdf as item()) -as item()? 
-{ - if(exists($page)) - then PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPages() - =>PDPageTree:indexOf($page) -}; - -(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -@param $start first page to include -@param $end last page to include -:) -declare function pdfbox:extract-range($pdf as item(), - $start as xs:integer,$end as xs:integer) -as xs:base64Binary -{ - let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() - return (pdfbox:binary($a),pdfbox:close($a)) -}; - - -(:~ pageLabel for every page or empty if none -@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples -@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files -:) -declare function pdfbox:labels($pdf as item()) -as xs:string* -{ - let $pagelabels:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () -}; - -(:~ return text on $pageNo :) -declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) -as xs:string{ - let $tStripper := (# db:wrapjava instance #) { - PDFTextStripper:new() - => PDFTextStripper:setStartPage($pageNo) - => PDFTextStripper:setEndPage($pageNo) - } - return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} -}; - -(:~ return size of $pageNo (zero based) -@result e.g. [0.0,0.0,168.0,239.52] - :) -declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) -as xs:string{ - PDDocument:getPage($pdf, $pageNo) - =>PDPage:getMediaBox() - =>PDRectangle:toString() -}; - -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) -declare function pdfbox:version() -as xs:string{ - Q{java:org.apache.pdfbox.util.Version}getVersion() -}; - -(:~ convert date :) -declare %private -function pdfbox:gregToISO($item as item()?) 
-as xs:string?{ - if(exists($item)) - then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() - else () -}; - -(:~ fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until, note: $pos always zero -:) -declare %private function pdfbox:do-until( - $input as item()*, - $action as function(item()*, xs:integer) as item()*, - $predicate as function(item()*, xs:integer) as xs:boolean? -) as item()* -{ - let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) - return if(exists($fn)) - then $fn($input,$action,$predicate) - else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) - return if(exists($hof)) - then $hof($predicate(?,0),$action(?,0),$input) - else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") - -}; -pdfbox:property-map -property access map -keys are property names, -values are sequences of functions to get property from $pdf object -org.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3hasOutlineorg.expkg_zone58.Pdfbox3hasLabelsorg.expkg_zone58.Pdfbox3specificationjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetTitlejava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetAuthorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreatorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetProducerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetSubjectjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetKeywordsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreationDateorg.expkg_zone58.
Pdfbox3gregToISOjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetModificationDateorg.expkg_zone58.Pdfbox3gregToISOvariable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, - - "hasOutline": pdfbox:hasOutline#1, - - "hasLabels": pdfbox:hasLabels#1, - - "specification":pdfbox:specification#1, - - "title": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getTitle#1) , - - "author": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getAuthor#1 ), - - "creator": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreator#1), - - "producer": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getProducer#1), - - "subject": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getSubject#1), - - "keywords": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getKeywords#1), - - "creationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreationDate#1, - pdfbox:gregToISO#1), - - "modificationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) -} -with-document pattern: open pdf,apply function, close pdf -creates a local pdfobject and ensures it is closed after use -e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) -pdfbox:with-pdffunction pdfbox:with-pdf ( $src as xs:string, $fn as function(item())as item()* ) as item()* { let $pdf:=pdfbox:open($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }srcxs:stringfnfunction(item())as item()*item()org.expkg_zone58.Pdfbox3openorg.expkg_zone58.Pdfbox3closeorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2005/xqt-errorscodehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:with-pdf($src as xs:string, - $fn as function(item())as item()*) -as item()*{ - let 
$pdf:=pdfbox:open($src) - return try{ - $fn($pdf),pdfbox:close($pdf) - } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) - } - -} -open pdf using fetch:binary, returns pdf objectpdfbox:openfunction pdfbox:open ( $pdfsrc as item() ) as item() { pdfbox:open($pdfsrc, map{}) }pdfsrcitem()item()org.expkg_zone58.Pdfbox3openfunction pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -} -open pdf from file/url/binary, opts may have password , returns pdf object -$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options otionally with map {"password":}pdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item() { try{ if($pdfsrc instance of xs:base64Binary) then Loader:loadPDF( $pdfsrc,string($opts?password)) else if(starts-with($pdfsrc,"http")) then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) } catch *{ let $loc:=if($pdfsrc instance of xs:base64Binary) then "xs:base64Binary" else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } }pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) -as item(){ - try{ - - if($pdfsrc instance of xs:base64Binary) - then Loader:loadPDF( $pdfsrc,string($opts?password)) - else if(starts-with($pdfsrc,"http")) - then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) 
- else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) - -} catch *{ - let $loc:=if($pdfsrc instance of xs:base64Binary) - then "xs:base64Binary" - else $pdfsrc - return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) -} -} -The version of the PDF specification used by $pdf e.g "1.4" -returned as string to avoid float rounding issues -pdfbox:specificationfunction pdfbox:specification ( $pdf as item() ) as xs:string { PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetVersionfunction pdfbox:specification($pdf as item()) -as xs:string{ - PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() -} -Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:savefunction pdfbox:save ( $pdf as item(),$savepath as xs:string ) as xs:string { PDDocument:save($pdf, File:new($savepath)),$savepath }pdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:save($pdf as item(),$savepath as xs:string) -as xs:string{ - PDDocument:save($pdf, File:new($savepath)),$savepath -} -Create binary representation of $pdf object as xs:base64Binarypdfbox:binaryfunction pdfbox:binary ( $pdf as item() ) as xs:base64Binary { let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=PDDocument:save($pdf, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()xs:base64Binaryjava:java.io.ByteArrayOutputStreamnewjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:binary($pdf as item()) -as xs:base64Binary{ - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=PDDocument:save($pdf, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() -} -Release any resources related to $pdfpdfbox:closefunction pdfbox:close ( 
$pdf as item() ) as empty-sequence() { (# db:wrapjava void #) { PDDocument:close($pdf) } }pdfitem()empty-sequencejava:org.apache.pdfbox.pdmodel.PDDocumentclosefunction pdfbox:close($pdf as item()) -as empty-sequence(){ - (# db:wrapjava void #) { - PDDocument:close($pdf) - } -} -Number of pages in PDFpdfbox:number-of-pagesfunction pdfbox:number-of-pages ( $pdf as item() ) as xs:integer { PDDocument:getNumberOfPages($pdf) }pdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetNumberOfPagesfunction pdfbox:number-of-pages($pdf as item()) -as xs:integer{ - PDDocument:getNumberOfPages($pdf) -} -Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binary { let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) -as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() - -} 
-known property names sortedpdfbox:property-namesfunction pdfbox:property-names ( ) as xs:string* { $pdfbox:property-map=>map:keys()=>sort() }xs:stringorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property-names() -as xs:string*{ - $pdfbox:property-map=>map:keys()=>sort() -} -return value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()* { let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, function($result,$this as function(*)){$this($result)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) -as item()*{ - let $fns:= $pdfbox:property-map($property) - return if(exists($fns)) - then fold-left($fns, - $pdf, - function($result,$this as function(*)){$this($result)}) - else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) -} -summary CSV style info for all properties for $pdfpaths -pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*) { pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) }pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reporthttp://www.w3.org/2005/xpath-functions/mapkeysorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) -} -summary CSV style info for named properties for $pdfpaths -https://docs.basex.org/main/CSV_Functions#xquerypdfbox:reportfunction pdfbox:report ( $pdfpaths as item()*, $properties as xs:string* ) as map(*) { map{"names": array{"path",$properties}, "records": for $path in $pdfpaths let 
$name:=if($path instance of xs:base64Binary) then "binary" else $path return try{ let $pdf:=pdfbox:open($path) return (fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, string(pdfbox:property($pdf, $prop)))} ), pdfbox:close($pdf) ) } catch *{ fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, "#ERROR")} ) } } }pdfpathsitem()propertiesxs:stringmap(*)org.expkg_zone58.Pdfbox3openhttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendhttp://www.w3.org/2005/xpath-functionsstringorg.expkg_zone58.Pdfbox3propertyorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendfunction pdfbox:report($pdfpaths as item()*, $properties as xs:string*) -as map(*){ - map{"names": array{"path",$properties}, - - "records": for $path in $pdfpaths - let $name:=if($path instance of xs:base64Binary) then "binary" else $path - return try{ - let $pdf:=pdfbox:open($path) - return (fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, string(pdfbox:property($pdf, $prop)))} - ), pdfbox:close($pdf) - ) - } catch *{ - fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, "#ERROR")} - ) - } - - } -} -true if $pdf has an outlinepdfbox:hasOutlinefunction pdfbox:hasOutline ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() -} -true if $pdf has Labelspdfbox:hasLabelsfunction pdfbox:hasLabels ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) 
=>PDDocumentCatalog:getPageLabels() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() -} -XMP metadata as "RDF" document -usually rdf:RDF root, but sometimes x:xmpmetapdfbox:metadatafunction pdfbox:metadata ( $pdf as item() ) as document-node(element(*))? { let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () }pdfitem()document-node(element(*))java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDMetadataexportXMPMetadataorg.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3read-streamfunction pdfbox:metadata($pdf as item()) -as document-node(element(*))? 
-{ - let $m:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getMetadata() - return if(exists($m)) - then - let $is:=PDMetadata:exportXMPMetadata($m) - return pdfbox:do-until( - map{"n":0,"data":""}, - - function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, - - function($output,$pos) { $output?n eq -1 } - )?data=>parse-xml() - else () -} -read next block from XMP streampdfbox:read-streamfunction pdfbox:read-stream ( $is,$read as xs:string ) as map(*) { let $blen:=4096 let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() return map{"n":$n, "data": $read || $data} }isreadxs:stringmap(*)java:java.util.ArrayscopyOfhttp://www.w3.org/2001/XMLSchemabytejava:org.apache.pdfbox.cos.COSInputStreamreadhttp://www.w3.org/2001/XMLSchemainthttp://www.w3.org/2001/XMLSchemainthttp://basex.org/modules/convertintegers-to-base64http://www.w3.org/2005/xpath-functionssubsequencefunction pdfbox:read-stream($is,$read as xs:string) -as map(*){ - let $blen:=4096 - let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) - let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) - let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() - return map{"n":$n, "data": $read || $data} -} -outline for $pdf as map()*pdfbox:outlinefunction pdfbox:outline ( $pdf as item() ) as map(*)* { (# db:wrapjava some #) { let $outline:= PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() return if(exists($outline)) then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) } }pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildfunction pdfbox:outline($pdf as 
item()) -as map(*)*{ - (# db:wrapjava some #) { - let $outline:= - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - - return if(exists($outline)) - then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) - } -} -return bookmark info for children of $outlineItem as seq of mapspdfbox:outlinefunction pdfbox:outline ( $pdf as item(),$outlineItem as item()? ) as map(*)* { let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3outline_http://www.w3.org/2005/xpath-functions/mapgetfunction pdfbox:outline($pdf as item(),$outlineItem as item()?) -as map(*)*{ - let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) - return map:get($find,"list") -} -BaseX bug 10.7? error if inlined in outlinepdfbox:outline_function pdfbox:outline_ ( $pdf as item(),$outlineItem as item()? ) as map(*) { pdfbox:do-until( map{"list":(),"this":$outlineItem}, function($input,$pos ) { let $bk:= pdfbox:bookmark($input?this,$pdf) let $bk:= if($bk?hasChildren) then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ "list": ($input?list, $bk), "this": PDOutlineItem:getNextSibling($input?this)} }, function($output,$pos) { empty($output?this) } ) }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3bookmarkorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildhttp://www.w3.org/2005/xpath-functions/mapmergehttp://www.w3.org/2005/xpath-functions/mapentryjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetNextSiblinghttp://www.w3.org/2005/xpath-functionsemptyfunction pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
-as map(*){ - pdfbox:do-until( - - map{"list":(),"this":$outlineItem}, - - function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) - return map:merge(($bk,map:entry("children",$kids))) - else $bk - return map{ - "list": ($input?list, $bk), - "this": PDOutlineItem:getNextSibling($input?this)} - }, - - function($output,$pos) { empty($output?this) } - ) -} -PDF outline in xml formatpdfbox:outline-xmlfunction pdfbox:outline-xml ( $pdf as item() ) as element(outline)? { let $outline:=pdfbox:outline($pdf) return if(exists($outline)) then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> else () }pdfitem()element(outline)org.expkg_zone58.Pdfbox3outlinehttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:outline-xml($pdf as item()) -as element(outline)?{ - let $outline:=pdfbox:outline($pdf) - return if(exists($outline)) - then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> - else () -} -recursive ouutline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)* { $outline! <bookmark title="{?title}" index="{?index}"> {?children!pdfbox:bookmark-xml(.)} </bookmark> }outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) -as element(bookmark)* -{ - $outline! 
- <bookmark title="{?title}" index="{?index}"> - {?children!pdfbox:bookmark-xml(.)} - </bookmark> -} -return bookmark info for $bookmark -map{index:..,title:..,hasChildren:..}pdfbox:bookmarkfunction pdfbox:bookmark ( $bookmark as item(),$pdf as item() ) as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} (:=>translate("�",""), :), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }bookmarkitem()pdfitem()map(*)java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemfindDestinationPagejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetTitlejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemhasChildrenfunction pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -} -pageIndex of $page in $pdfpdfbox:find-pagefunction pdfbox:find-page ( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $pdf as item() ) as item()? { if(exists($page)) then PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPages() =>PDPageTree:indexOf($page) }pageitem()pdfitem()item()http://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:find-page( - $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), - $pdf as item()) -as item()? 
-{ - if(exists($page)) - then PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPages() - =>PDPageTree:indexOf($page) -} -Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -$start first page to include$end last page to includepdfbox:extract-rangefunction pdfbox:extract-range ( $pdf as item(), $start as xs:integer,$end as xs:integer ) as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) }pdfitem()startxs:integerendxs:integerxs:base64Binaryjava:org.apache.pdfbox.multipdf.PageExtractorneworg.expkg_zone58.Pdfbox3binaryorg.expkg_zone58.Pdfbox3closefunction pdfbox:extract-range($pdf as item(), - $start as xs:integer,$end as xs:integer) -as xs:base64Binary -{ - let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() - return (pdfbox:binary($a),pdfbox:close($a)) -} -pageLabel for every page or empty if none -https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labelsfunction pdfbox:labels ( $pdf as item() ) as xs:string* { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return if(exists($pagelabels)) then PDPageLabels:getLabelsByPageIndices($pagelabels) else () }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetLabelsByPageIndicesfunction pdfbox:labels($pdf as item()) -as xs:string* -{ - let $pagelabels:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () -} -return text on $pageNopdfbox:page-textfunction pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string { let $tStripper := (# db:wrapjava instance #) { 
PDFTextStripper:new() => PDFTextStripper:setStartPage($pageNo) => PDFTextStripper:setEndPage($pageNo) } return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.text.PDFTextStrippernewjava:org.apache.pdfbox.text.PDFTextStrippergetTextfunction pdfbox:page-text($pdf as item(), $pageNo as xs:integer) -as xs:string{ - let $tStripper := (# db:wrapjava instance #) { - PDFTextStripper:new() - => PDFTextStripper:setStartPage($pageNo) - => PDFTextStripper:setEndPage($pageNo) - } - return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} -} -return size of $pageNo (zero based) -e.g. [0.0,0.0,168.0,239.52]pdfbox:page-media-boxfunction pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:string { PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetPagefunction pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) -as xs:string{ - PDDocument:getPage($pdf, $pageNo) - =>PDPage:getMediaBox() - =>PDRectangle:toString() -} -version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:string { Q{java:org.apache.pdfbox.util.Version}getVersion() }xs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() -as xs:string{ - Q{java:org.apache.pdfbox.util.Version}getVersion() -} -convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string? { if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() else () }itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) 
-as xs:string?{ - if(exists($item)) - then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() - else () -} -fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until, note: $pos always zero -pdfbox:do-untilfunction pdfbox:do-until ( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()* { let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) return if(exists($fn)) then $fn($input,$action,$predicate) else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) return if(exists($hof)) then $hof($predicate(?,0),$action(?,0),$input) else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") }inputitem()actionfunction(item()*, xs:integer) as item()*predicatefunction(item()*, xs:integer) as xs:boolean?item()http://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamefunction pdfbox:do-until( - $input as item()*, - $action as function(item()*, xs:integer) as item()*, - $predicate as function(item()*, xs:integer) as xs:boolean? 
-) as item()* -{ - let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) - return if(exists($fn)) - then $fn($input,$action,$predicate) - else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) - return if(exists($hof)) - then $hof($predicate(?,0),$action(?,0),$input) - else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") - -} \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqparse.xml b/docs/xqdoc/modules/F000001/xqparse.xml deleted file mode 100644 index d14bbdb..0000000 --- a/docs/xqdoc/modules/F000001/xqparse.xml +++ /dev/null @@ -1,443 +0,0 @@ -xquery version '3.1'; -(:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -@see https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ -@author Andy Bunce 2025 -:) - -module namespace pdfbox="org.expkg_zone58.Pdfbox3"; - -declare namespace Loader ="java:org.apache.pdfbox.Loader"; -declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; -declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; -declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; -declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; -declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; -declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; -declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; -declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; -declare namespace 
PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; -declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; -declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; -declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; - -declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; - - -declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; -declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; - -declare namespace File ="java:java.io.File"; - - - -(:~ with-document pattern: open pdf,apply function, close pdf - creates a local pdfobject and ensures it is closed after use -e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) -:) -declare function pdfbox:with-pdf($src as xs:string, - $fn as function(item())as item()*) -as item()*{ - let $pdf:=pdfbox:open($src) - return try{ - $fn($pdf),pdfbox:close($pdf) - } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) - } - -}; - - -(:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -}; - -(:~ open pdf from file/url/binary, opts may have password , returns pdf object -@param $pdfsrc a fetchable url or filepath, or xs:base64Binary item -@param $opts options otionally with map {"password":} -:) -declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) -as item(){ - try{ - - if($pdfsrc instance of xs:base64Binary) - then Loader:loadPDF( $pdfsrc,string($opts?password)) - else if(starts-with($pdfsrc,"http")) - then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) - else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) - -} catch *{ - let $loc:=if($pdfsrc instance 
of xs:base64Binary) - then "xs:base64Binary" - else $pdfsrc - return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) -} -}; - -(:~ The version of the PDF specification used by $pdf e.g "1.4" -returned as string to avoid float rounding issues - :) -declare function pdfbox:specification($pdf as item()) -as xs:string{ - PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() -}; - -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) -declare function pdfbox:save($pdf as item(),$savepath as xs:string) -as xs:string{ - PDDocument:save($pdf, File:new($savepath)),$savepath -}; - -(:~ Create binary representation of $pdf object as xs:base64Binary :) -declare function pdfbox:binary($pdf as item()) -as xs:base64Binary{ - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=PDDocument:save($pdf, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() -}; - -(:~ Release any resources related to $pdf:) -declare function pdfbox:close($pdf as item()) -as empty-sequence(){ - (# db:wrapjava void #) { - PDDocument:close($pdf) - } -}; - -(:~ Number of pages in PDF:) -declare function pdfbox:number-of-pages($pdf as item()) -as xs:integer{ - PDDocument:getNumberOfPages($pdf) -}; - -(:~ Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) -declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) -as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() - -}; - - -(:~ property access map - keys are property names, - values are sequences of functions to get property from $pdf object -:) -declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, - - "hasOutline": pdfbox:hasOutline#1, - - "hasLabels": pdfbox:hasLabels#1, - - "specification":pdfbox:specification#1, - - "title": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getTitle#1) , - - "author": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getAuthor#1 ), - - "creator": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreator#1), - - "producer": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getProducer#1), - - "subject": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getSubject#1), - - "keywords": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getKeywords#1), - - "creationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getCreationDate#1, - pdfbox:gregToISO#1), - - "modificationDate": (PDDocument:getDocumentInformation#1, - PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) -}; - -(:~ known property names sorted :) -declare function pdfbox:property-names() -as xs:string*{ - $pdfbox:property-map=>map:keys()=>sort() -}; - -(:~ return value of $property for $pdf :) -declare function pdfbox:property($pdf as item(),$property as xs:string) -as item()*{ - let $fns:= 
$pdfbox:property-map($property) - return if(exists($fns)) - then fold-left($fns, - $pdf, - function($result,$this as function(*)){$this($result)}) - else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) -}; - -(:~ summary CSV style info for all properties for $pdfpaths -:) -declare function pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) -}; - -(:~ summary CSV style info for named properties for $pdfpaths -@see https://docs.basex.org/main/CSV_Functions#xquery -:) -declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*) -as map(*){ - map{"names": array{"path",$properties}, - - "records": for $path in $pdfpaths - let $name:=if($path instance of xs:base64Binary) then "binary" else $path - return try{ - let $pdf:=pdfbox:open($path) - return (fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, string(pdfbox:property($pdf, $prop)))} - ), pdfbox:close($pdf) - ) - } catch *{ - fold-left($properties, - array{$name}, - function($result as array(*),$prop as xs:string){ - array:append($result, "#ERROR")} - ) - } - - } -}; - -(:~ true if $pdf has an outline :) -declare function pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() -}; - -(:~ true if $pdf has Labels :) -declare function pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() -}; - -(:~ XMP metadata as "RDF" document -@note usually rdf:RDF root, but sometimes x:xmpmeta -:) -declare function pdfbox:metadata($pdf as item()) -as document-node(element(*))? 
-{ - let $m:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getMetadata() - return if(exists($m)) - then - let $is:=PDMetadata:exportXMPMetadata($m) - return pdfbox:do-until( - map{"n":0,"data":""}, - - function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, - - function($output,$pos) { $output?n eq -1 } - )?data=>parse-xml() - else () -}; - -(:~ read next block from XMP stream :) -declare %private function pdfbox:read-stream($is,$read as xs:string) -as map(*){ - let $blen:=4096 - let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) - let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) - let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() - return map{"n":$n, "data": $read || $data} -}; - -(:~ outline for $pdf as map()* :) -declare function pdfbox:outline($pdf as item()) -as map(*)*{ - (# db:wrapjava some #) { - let $outline:= - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - - return if(exists($outline)) - then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) - } -}; - -(:~ return bookmark info for children of $outlineItem as seq of maps :) -declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) -as map(*)*{ - let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) - return map:get($find,"list") -}; - -(:~ BaseX bug 10.7? error if inlined in outline :) -declare %private function pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
-as map(*){ - pdfbox:do-until( - - map{"list":(),"this":$outlineItem}, - - function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) - return map:merge(($bk,map:entry("children",$kids))) - else $bk - return map{ - "list": ($input?list, $bk), - "this": PDOutlineItem:getNextSibling($input?this)} - }, - - function($output,$pos) { empty($output?this) } - ) -}; - -(:~ PDF outline in xml format :) -declare function pdfbox:outline-xml($pdf as item()) -as element(outline)?{ - let $outline:=pdfbox:outline($pdf) - return if(exists($outline)) - then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> - else () -}; - -(:~ recursive ouutline map to XML :) -declare %private function pdfbox:bookmark-xml($outline as map(*)*) -as element(bookmark)* -{ - $outline! - <bookmark title="{?title}" index="{?index}"> - {?children!pdfbox:bookmark-xml(.)} - </bookmark> -}; - -(:~ return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} -:) -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -}; - - -(:~ pageIndex of $page in $pdf :) -declare function pdfbox:find-page( - $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), - $pdf as item()) -as item()? 
-{ - if(exists($page)) - then PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPages() - =>PDPageTree:indexOf($page) -}; - -(:~ Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -@param $start first page to include -@param $end last page to include -:) -declare function pdfbox:extract-range($pdf as item(), - $start as xs:integer,$end as xs:integer) -as xs:base64Binary -{ - let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() - return (pdfbox:binary($a),pdfbox:close($a)) -}; - - -(:~ pageLabel for every page or empty if none -@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples -@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files -:) -declare function pdfbox:labels($pdf as item()) -as xs:string* -{ - let $pagelabels:=PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () -}; - -(:~ return text on $pageNo :) -declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) -as xs:string{ - let $tStripper := (# db:wrapjava instance #) { - PDFTextStripper:new() - => PDFTextStripper:setStartPage($pageNo) - => PDFTextStripper:setEndPage($pageNo) - } - return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} -}; - -(:~ return size of $pageNo (zero based) -@result e.g. [0.0,0.0,168.0,239.52] - :) -declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) -as xs:string{ - PDDocument:getPage($pdf, $pageNo) - =>PDPage:getMediaBox() - =>PDRectangle:toString() -}; - -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) -declare function pdfbox:version() -as xs:string{ - Q{java:org.apache.pdfbox.util.Version}getVersion() -}; - -(:~ convert date :) -declare %private -function pdfbox:gregToISO($item as item()?) 
-as xs:string?{ - if(exists($item)) - then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() - else () -}; - -(:~ fn:do-until shim for BaseX 9+10 -if fn:do-until not found use hof:until, note: $pos always zero -:) -declare %private function pdfbox:do-until( - $input as item()*, - $action as function(item()*, xs:integer) as item()*, - $predicate as function(item()*, xs:integer) as xs:boolean? -) as item()* -{ - let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) - return if(exists($fn)) - then $fn($input,$action,$predicate) - else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) - return if(exists($hof)) - then $hof($predicate(?,0),$action(?,0),$input) - else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") - -}; - \ No newline at end of file diff --git a/docs/xqdoc/resources/base.css b/docs/xqdoc/resources/base.css deleted file mode 100644 index ff84327..0000000 --- a/docs/xqdoc/resources/base.css +++ /dev/null @@ -1,1153 +0,0 @@ -/****************************************************************************** - * Style sheet for the W3C specifications * - * - * Special classes handled by this style sheet include: - * - * Indices - * - .toc for the Table of Contents (
                                      ) - * + for the section numbers - * - #toc for the Table of Contents (