From e6832289b6bf29c27b912caf44a85a69f669f789 Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Tue, 3 Jun 2025 22:34:21 +0100 Subject: [PATCH] [mod] labels --- docs/xqdoc/annotations.html | 2 +- docs/xqdoc/imports.html | 2 +- docs/xqdoc/index.html | 6 +- docs/xqdoc/modules/F000001/index.html | 384 +++++++++++++++++-------- docs/xqdoc/modules/F000001/xqdoc.xml | 307 ++++++++++++++------ docs/xqdoc/modules/F000001/xqparse.xml | 157 +++++++--- docs/xqdoc/restxq.html | 2 +- docs/xqdoc/validation-report.xml | 2 +- docs/xqdoc/xqdoca.xml | 2 +- package.json | 2 +- src/Pdfbox3.xqm | 147 +++++++--- 11 files changed, 726 insertions(+), 287 deletions(-) diff --git a/docs/xqdoc/annotations.html b/docs/xqdoc/annotations.html index 8ca7cde..1830185 100644 --- a/docs/xqdoc/annotations.html +++ b/docs/xqdoc/annotations.html @@ -8,4 +8,4 @@ Contents
  1. Summary
  2. Annotations
    1. 2.1 http://www.w3.org/2012/xquery

Summary

This project uses 1 annotation namespaces.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Annotations

2.1 http://www.w3.org/2012/xquery

private
\ No newline at end of file +   on Tuesday, 3rd June 2025

\ No newline at end of file diff --git a/docs/xqdoc/imports.html b/docs/xqdoc/imports.html index 8cee469..f8924ef 100644 --- a/docs/xqdoc/imports.html +++ b/docs/xqdoc/imports.html @@ -6,4 +6,4 @@ Contents
  1. Summary
  2. Imports

    Summary

    Lists all modules imported.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    Imports (0)

    \ No newline at end of file +   on Tuesday, 3rd June 2025

    \ No newline at end of file diff --git a/docs/xqdoc/index.html b/docs/xqdoc/index.html index 7f52373..f2a36ba 100644 --- a/docs/xqdoc/index.html +++ b/docs/xqdoc/index.html @@ -6,9 +6,9 @@ 1 XQuery source files, and uses 1 annotation namespaces.

    This document was built from source folder C:/Users/mrwhe/git/expkg-zone58/pdfbox/src/ on - Sunday, 1st June 2025.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    importsSummary of import usagexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    XQuery Main (0)

    None

    XQuery Library (1)

    UriPrefixDescriptionUseAMetrics
    org.expkg_zone58.Pdfbox3pdfbox + Tuesday, 3rd June 2025.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    importsSummary of import usagexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    XQuery Main (0)

    None

    XQuery Library (1)

    UriPrefixDescriptionUseAMetrics
    org.expkg_zone58.Pdfbox3pdfbox -A BaseX 10.7+ interface to pdfbox 3.0 https...
    0
    Library
    ↖0
    P
    V#1
    F#31

    File view (1)

    Annotation namespaces (1)

    A total of 7 annotations are defined. +A BaseX 10.7+ interface to pdfbox3 https://...

    0
    Library
    ↖0
    P
    V#1
    F#37

    File view (1)

    Annotation namespaces (1)

    A total of 7 annotations are defined.

    http://www.w3.org/2012/xquery

    private7
    \ No newline at end of file +   on Tuesday, 3rd June 2025

    \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/index.html b/docs/xqdoc/modules/F000001/index.html index 7331d1c..0ed301c 100644 --- a/docs/xqdoc/modules/F000001/index.html +++ b/docs/xqdoc/modules/F000001/index.html @@ -1,12 +1,14 @@ src - xqDocA - xqDocA

    org.expkg_zone58.Pdfbox3  library module
    P

    Summary

    +

    org.expkg_zone58.Pdfbox3

    1. 1 Summary
    2. 2 Imports
    3. 3 Variables
      1. 3.1$pdfbox:property-map
        P
    4. 4 Functions
      1. 4.1binary
      2. 4.2bookmark
        P
      3. 4.3bookmark-xml
        P
      4. 4.4close
      5. 4.5do-until
        P
      6. 4.6extract-range
      7. 4.7find-page
      8. 4.8gregToISO
        P
      9. 4.9label-as-map
      10. 4.10label-as-string
      11. 4.11labels-as-map
      12. 4.12labels-as-strings
      13. 4.13labels-by-page
      14. 4.14metadata
      15. 4.15number-of-bookmarks
      16. 4.16number-of-labels
      17. 4.17number-of-pages
      18. 4.18open
      19. 4.19outline
      20. 4.20outline-xml
      21. 4.21outline_
        P
      22. 4.22page-labels
      23. 4.23page-media-box
      24. 4.24page-render
      25. 4.25page-text
      26. 4.26pdf-save
      27. 4.27property
      28. 4.28property-names
      29. 4.29read-stream
        P
      30. 4.30report
      31. 4.31report-save
      32. 4.32specification
      33. 4.33version
      34. 4.34with-pdf
    5. 5 Namespaces
    6. 6 RestXQ
    7. 7 Source

    Summary

    -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -
    See also
    Authors
    • Andy Bunce 2025
    Custom
    Related documents
    ViewDescriptionFormat
    xqdocxqDoc xml file from the source modulexml
    xqparsexqparse xml file from the source modulexml

    Imports

    +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, in lib/custom or xar +refer to the same concept. Also label and (page)range are used interchangeably +

    See also
    Authors
    • Andy Bunce 2025
    Custom
    Related documents
    ViewDescriptionFormat
    xqdocxqDoc xml file from the source modulexml
    xqparsexqparse xml file from the source modulexml

    Imports

    This module is imported by 0 modules. It imports 0 modules. @@ -14,12 +16,12 @@ tested with pdfbox-app-3.0.5.jar property access map keys are property names, values are sequences of functions to get property from $pdf object -

    Type
    References 14 functions from 3 modules
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getAuthor#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreationDate#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreator#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getKeywords#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getModificationDate#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getProducer#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getSubject#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getTitle#1
    • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentInformation#1
    • pdfbox:gregToISO#1
    • pdfbox:hasLabels#1
    • pdfbox:hasOutline#1
    • pdfbox:number-of-pages#1
    • pdfbox:specification#1
    Annotations (1)
    %private()
    Source ( 35 lines)
    variable $pdfbox:property-map:=map{
    -  "pageCount": pdfbox:number-of-pages#1,
    +
    Type
    References 15 functions from 3 modules
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getAuthor#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreationDate#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getCreator#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getKeywords#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getModificationDate#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getProducer#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getSubject#1
    • {java:org.apache.pdfbox.pdmodel.PDDocumentInformation}getTitle#1
    • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentInformation#1
    • pdfbox:gregToISO#1
    • pdfbox:labels-as-strings#1
    • pdfbox:number-of-bookmarks#1
    • pdfbox:number-of-labels#1
    • pdfbox:number-of-pages#1
    • pdfbox:specification#1
    Annotations (1)
    %private()
    Source ( 36 lines)
    variable $pdfbox:property-map:=map{
    +  "#pages": pdfbox:number-of-pages#1,
     
    -  "hasOutline": pdfbox:hasOutline#1,
    +  "#bookmarks": pdfbox:number-of-bookmarks#1,
     
    -  "hasLabels": pdfbox:hasLabels#1,
    +  "#labels": pdfbox:number-of-labels#1,
     
       "specification":pdfbox:specification#1,
     
    @@ -47,7 +49,8 @@ values are sequences of functions to get property from $pdf object
     
       "modificationDate":  (PDDocument:getDocumentInformation#1,
                             PDDocumentInformation:getModificationDate#1,
    -                        pdfbox:gregToISO#1)
    +                        pdfbox:gregToISO#1),
    +   "labels":      pdfbox:labels-as-strings#1                     
     }

    Functions

    4.1 pdfbox:binary

    Arities: #1

    Summary
    Create binary representation of $pdf object as xs:base64Binary
    Signatures
    pdfbox:binary ( @@ -58,7 +61,7 @@ as xs:base64Binary{ return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }

    4.2 pdfbox:bookmark

    Arities: #2P

    Summary
    -return bookmark info for $bookmark +Return bookmark info for $bookmark
    Signatures
    pdfbox:bookmark ( $bookmark as item(), $pdf as item() ) as map(*)
    Parameters
    • bookmark as item()
    • pdf as item()
    Return
    • map(*) map{index:..,title:..,hasChildren:..}
    Referenced by 1 functions from 1 modules
    References 3 functions from 1 modules
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}findDestinationPage#2
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getTitle#1
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}hasChildren#1
    Annotations (1)
    %private()
    Source ( 10 lines)
    function pdfbox:bookmark($bookmark as item(),$pdf as item())
    @@ -71,7 +74,7 @@ as map(*)
       "hasChildren": PDOutlineItem:hasChildren($bookmark)
       }
     }

    4.3 pdfbox:bookmark-xml

    Arities: #1P

    Summary
    -recursive ouutline map to XML
    Signatures
    pdfbox:bookmark-xml +Convert outline map to XML
    Signatures
    pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)*
    Parameters
    • outline as map(*)*
    Return
    • element(bookmark) *
    Referenced by 2 functions from 1 modules
    References 1 functions from 1 modules
    Annotations (1)
    %private()
    Source ( 8 lines)
    function pdfbox:bookmark-xml($outline as map(*)*)
     as element(bookmark)*
    @@ -130,42 +133,78 @@ as item()?
           =>PDDocumentCatalog:getPages()
           =>PDPageTree:indexOf($page)
     }

    4.8 pdfbox:gregToISO

    Arities: #1P

    Summary
    -convert date
    Signatures
    pdfbox:gregToISO +Convert date
    Signatures
    pdfbox:gregToISO ( $item as item()? ) as xs:string?
    Parameters
    • item as item()?
    Return
    • xs:string ?
    Referenced by 0 functions from 0 modules
      References 2 functions from 2 modules
      • {http://www.w3.org/2005/xpath-functions}exists#1
      • {java:java.util.GregorianCalendar}toZonedDateTime#1
      Annotations (1)
      %private()
      Source ( 6 lines)
      function pdfbox:gregToISO($item as item()?)
       as xs:string?{
        if(exists($item))
        then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
        else ()
      -}

      4.9 pdfbox:hasLabels

      Arities: #1

      Summary
      -true if $pdf has Labels
      Signatures
      pdfbox:hasLabels +}

      4.9 pdfbox:label-as-map

      Arities: #2

      Summary
      +express label/page-range for $page as map
      Signatures
      pdfbox:label-as-map ( - $pdf as item() ) as xs:boolean
      Parameters
      • pdf as item()
      Return
      • xs:boolean
      Referenced by 0 functions from 0 modules
        References 1 functions from 1 modules
        • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
        Source ( 6 lines)
        function pdfbox:hasLabels($pdf as item())
        -as xs:boolean{
        -  PDDocument:getDocumentCatalog($pdf)
        -  =>PDDocumentCatalog:getPageLabels()
        -  =>exists()
        -}

        4.10 pdfbox:hasOutline

        Arities: #1

        Summary
        -true if $pdf has an outline
        Signatures
        pdfbox:hasOutline - ( - $pdf as item() ) as xs:boolean
        Parameters
        • pdf as item()
        Return
        • xs:boolean
        Referenced by 0 functions from 0 modules
          References 1 functions from 1 modules
          • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
          Source ( 6 lines)
          function pdfbox:hasOutline($pdf as item())
          -as xs:boolean{
          -  PDDocument:getDocumentCatalog($pdf)
          -  =>PDDocumentCatalog:getDocumentOutline()
          -  =>exists()
          -}

          4.11 pdfbox:labels

          Arities: #1

          Summary
          -pageLabel for every page or empty if none -
          Signatures
          pdfbox:labels - ( - $pdf as item() ) as xs:string*
          Parameters
          • pdf as item()
          Return
          • xs:string *
          Tags
          Referenced by 0 functions from 0 modules
            References 3 functions from 3 modules
            • {http://www.w3.org/2005/xpath-functions}exists#1
            • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getLabelsByPageIndices#1
            Source ( 9 lines)
            function pdfbox:labels($pdf as item())
            -as xs:string*
            +			$pagelabels, $page as xs:integer ) as map(*)
            Parameters
            • pagelabels as 
            • page as xs:integer
            Return
            • map(*)
            Referenced by 1 functions from 1 modules
            References 5 functions from 3 modules
            • {http://www.w3.org/2005/xpath-functions}empty#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getPrefix#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStart#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStyle#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageLabelRange#2
            Source ( 13 lines)
            function pdfbox:label-as-map($pagelabels,$page as  xs:integer)
            +as map(*)
             {
            +  let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page)
            +  return if(empty($label))
            +  then ()
            +  else map{
            +      "index": $page,
            +      "prefix": PDPageLabelRange:getPrefix($label),
            +      "start":  PDPageLabelRange:getStart($label),
            +      "style":  PDPageLabelRange:getStyle($label)
            +      }
            +}

            4.10 pdfbox:label-as-string

            Arities: #2

            Summary
            +label for $page formatted as string
            Signatures
            pdfbox:label-as-string + ( + $pagelabels, $page as xs:integer ) as xs:string?
            Parameters
            • pagelabels as 
            • page as xs:integer
            Return
            • xs:string ?
            Referenced by 1 functions from 1 modules
            References 7 functions from 3 modules
            • {http://www.w3.org/2005/xpath-functions}empty#1
            • {http://www.w3.org/2005/xpath-functions}exists#1
            • {http://www.w3.org/2005/xpath-functions}string-join#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getPrefix#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStart#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStyle#1
            • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageLabelRange#2
            Source ( 15 lines)
            function pdfbox:label-as-string($pagelabels,$page as  xs:integer)
            +as xs:string?{
            +  let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page)
            +  return  if(empty($label))
            +          then ()
            +          else
            +            let $start:=  PDPageLabelRange:getStart($label)
            +            let $style := PDPageLabelRange:getStyle($label)
            +            let $prefix:= PDPageLabelRange:getPrefix($label) 
            +            return string-join(($page, 
            +                                if(empty($style)) then "-" else $style,
            +                                if(($start eq 1)) then "" else $start,
            +                                if(exists($prefix)) then '*' || $prefix  (:TODO double " :)
            +                    ))
            +}

            4.11 pdfbox:labels-as-map

            Arities: #1

            Summary
            +sequence of maps for each label in $pdf
            Signatures
            pdfbox:labels-as-map + ( + $pdf as item() ) as map(*)*
            Parameters
            • pdf as item()
            Return
            • map(*) *
            Referenced by 0 functions from 0 modules
              References 3 functions from 2 modules
              Source ( 8 lines)
              function pdfbox:labels-as-map($pdf as item())
              +as map(*)*{
                 let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
                                  =>PDDocumentCatalog:getPageLabels()
              -  return if(exists($pagelabels))
              -         then PDPageLabels:getLabelsByPageIndices($pagelabels)
              -         else ()
              -}

              4.12 pdfbox:metadata

              Arities: #1

              Summary
              + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-map($pagelabels,.) +}

              4.12 pdfbox:labels-as-strings

              Arities: #1

              Summary
              +sequence of label ranges defined in PDF as formatted strings
              Signatures
              pdfbox:labels-as-strings + ( + $pdf as item() ) as xs:string
              Parameters
              • pdf as item()
              Return
              • xs:string
              Referenced by 0 functions from 0 modules
                References 3 functions from 2 modules
                Source ( 9 lines)
                function pdfbox:labels-as-strings($pdf as item())
                +as xs:string{
                +  let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
                +                   =>PDDocumentCatalog:getPageLabels()
                +  return $pagelabels
                +         !(0 to pdfbox:number-of-pages($pdf)-1)
                +         !pdfbox:label-as-string($pagelabels,.)=>string-join(",")
                +            
                +}

                4.13 pdfbox:labels-by-page

                Arities: #1

                Summary
                +pageLabel for every page, derived from page-ranges +The returned sequence will contain at MOST as many entries as the document has pages. +
                Signatures
                pdfbox:labels-by-page + ( + $pdf as item() ) as xs:string*
                Parameters
                • pdf as item()
                Return
                • xs:string *
                Tags
                Referenced by 0 functions from 0 modules
                  References 1 functions from 1 modules
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                  Source ( 7 lines)
                  function pdfbox:labels-by-page($pdf as item())
                  +as xs:string*
                  +{
                  +  PDDocument:getDocumentCatalog($pdf)
                  +  =>PDDocumentCatalog:getPageLabels()
                  +  =>PDPageLabels:getLabelsByPageIndices()
                  +}

                  4.14 pdfbox:metadata

                  Arities: #1

                  Summary
                  XMP metadata as "RDF" document
                  Signatures
                  pdfbox:metadata ( @@ -186,18 +225,37 @@ as document-node(element(*))? function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () -}

                  4.13 pdfbox:number-of-pages

                  Arities: #1

                  Summary
                  +}

                  4.15 pdfbox:number-of-bookmarks

                  Arities: #1

                  Summary
                  +number of outline items
                  Signatures
                  pdfbox:number-of-bookmarks + ( + $pdf as item() ) as xs:integer
                  Parameters
                  • pdf as item()
                  Return
                  • xs:integer
                  Referenced by 0 functions from 0 modules
                    References 2 functions from 2 modules
                    Source ( 5 lines)
                    function pdfbox:number-of-bookmarks($pdf as item())
                    +as xs:integer{
                    +  let $xml:=pdfbox:outline-xml($pdf)
                    +  return count($xml//bookmark)
                    +}

                    4.16 pdfbox:number-of-labels

                    Arities: #1

                    Summary
                    +The number of labels defined in PDF
                    Signatures
                    pdfbox:number-of-labels + ( + $pdf as item() ) as xs:integer
                    Parameters
                    • pdf as item()
                    Return
                    • xs:integer
                    Referenced by 0 functions from 0 modules
                      References 3 functions from 3 modules
                      • {http://www.w3.org/2005/xpath-functions}exists#1
                      • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                      • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageRangeCount#1
                      Source ( 9 lines)
                      function pdfbox:number-of-labels($pdf as item())
                      +as xs:integer
                      +{
                      +  let $labels:=PDDocument:getDocumentCatalog($pdf)
                      +               =>PDDocumentCatalog:getPageLabels()
                      +  return if(exists($labels)) 
                      +         then PDPageLabels:getPageRangeCount($labels)
                      +         else 0
                      +}

                      4.17 pdfbox:number-of-pages

                      Arities: #1

                      Summary
                      Number of pages in PDF
                      Signatures
                      pdfbox:number-of-pages ( - $pdf as item() ) as xs:integer
                      Parameters
                      • pdf as item()
                      Return
                      • xs:integer
                      Referenced by 0 functions from 0 modules
                        References 1 functions from 1 modules
                        • {java:org.apache.pdfbox.pdmodel.PDDocument}getNumberOfPages#1
                        Source ( 4 lines)
                        function pdfbox:number-of-pages($pdf as item())
                        +			$pdf as item() ) as xs:integer
                        Parameters
                        Return
                        Referenced by 2 functions from 1 modules
                        References 1 functions from 1 modules
                        Source ( 4 lines)
                        function pdfbox:number-of-pages($pdf as item())
                         as xs:integer{
                           PDDocument:getNumberOfPages($pdf)
                        -}

                        4.14 pdfbox:open

                        Arities: #1#2

                        Summary
                        +}

                        4.18 pdfbox:open

                        Arities: #1#2

                        Summary
                        open pdf using fetch:binary, returns pdf object
                        Signatures
                        pdfbox:open ( $pdfsrc as item() ) as item()
                        pdfbox:open ( - $pdfsrc as item(), $opts as map(*) ) as item()
                        Parameters
                        • pdfsrc as item() a fetchable url or filepath, or xs:base64Binary item
                        • opts as map(*) options optionally with map {"password":}
                        Return
                        • item()
                        Referenced by 3 functions from 1 modules
                        References 8 functions from 6 modules
                        • {http://basex.org/modules/fetch}binary#1
                        • {http://www.w3.org/2001/XMLSchema}QName#1
                        • {http://www.w3.org/2005/xpath-functions}error#2
                        • {http://www.w3.org/2005/xpath-functions}starts-with#2
                        • {http://www.w3.org/2005/xpath-functions}string#1
                        • {java:org.apache.pdfbox.Loader}loadPDF#2
                        • {java:org.apache.pdfbox.io.RandomAccessReadBufferedFile}new#1
                        • pdfbox:open#2
                        Source ( 21 lines)
                        function pdfbox:open($pdfsrc as item())
                        +			$pdfsrc as item(), $opts as map(*) ) as item()
                        Parameters
                        Return
                        Tags
                        • @note: + fetch:binary for https will use a lot of memory here
                        Referenced by 3 functions from 1 modules
                        References 8 functions from 6 modules
                        Source ( 21 lines)
                        function pdfbox:open($pdfsrc as item())
                         as item(){
                         pdfbox:open($pdfsrc, map{})
                         }
                        function pdfbox:open($pdfsrc as item(), $opts as map(*))
                        @@ -216,7 +274,7 @@ as item(){
                                       else $pdfsrc
                             return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
                         }
                        -}

                        4.15 pdfbox:outline

                        Arities: #1#2

                        Summary
                        +}

                        4.19 pdfbox:outline

                        Arities: #1#2

                        Summary
                        outline for $pdf as map()*
                        Signatures
                        pdfbox:outline ( $pdf as item() ) as map(*)*
                        pdfbox:outline @@ -235,16 +293,16 @@ as map(*)*{ as map(*)*{ let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") -}

                        4.16 pdfbox:outline-xml

                        Arities: #1

                        Summary
                        +}

                        4.20 pdfbox:outline-xml

                        Arities: #1

                        Summary
                        PDF outline in xml format
                        Signatures
                        pdfbox:outline-xml ( - $pdf as item() ) as element(outline)?
                        Parameters
                        • pdf as item()
                        Return
                        • element(outline) ?
                        Referenced by 0 functions from 0 modules
                          References 3 functions from 2 modules
                          Source ( 7 lines)
                          function pdfbox:outline-xml($pdf as item())
                          +			$pdf as item() ) as element(outline)?
                          Parameters
                          Return
                          Referenced by 1 functions from 1 modules
                          References 3 functions from 2 modules
                          Source ( 7 lines)
                          function pdfbox:outline-xml($pdf as item())
                           as element(outline)?{
                            let $outline:=pdfbox:outline($pdf)
                             return if(exists($outline))
                                    then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
                                    else ()
                          -}

                          4.17 pdfbox:outline_

                          Arities: #2P

                          Summary
                          +}

                          4.21 pdfbox:outline_

                          Arities: #2P

                          Summary
                          BaseX bug 10.7? error if inlined in outline
                          Signatures
                          pdfbox:outline_ ( $pdf as item(), $outlineItem as item()? ) as map(*)
                          Parameters
                          • pdf as item()
                          • outlineItem as item()?
                          Return
                          • map(*)
                          Referenced by 1 functions from 1 modules
                          References 8 functions from 4 modules
                          • {http://www.w3.org/2005/xpath-functions/map}entry#2
                          • {http://www.w3.org/2005/xpath-functions/map}merge#1
                          • {http://www.w3.org/2005/xpath-functions}empty#1
                          • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                          • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getNextSibling#1
                          • pdfbox:bookmark#2
                          • pdfbox:do-until#3
                          • pdfbox:outline#2
                          Annotations (1)
                          %private()
                          Source ( 20 lines)
                          function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                          @@ -266,8 +324,15 @@ as map(*){
                           
                                function($output,$pos) { empty($output?this) }                      
                             )
                          -}

                          4.18 pdfbox:page-media-box

                          Arities: #2

                          Summary
                          -return size of $pageNo (zero based) +}

                          4.22 pdfbox:page-labels

                          Arities: #1

                          Summary
                          +get pagelabels, if they exist
                          Signatures
                          pdfbox:page-labels + ( + $pdf )
                          Parameters
                          • pdf as 
                          Return
                          Referenced by 0 functions from 0 modules
                            References 1 functions from 1 modules
                            • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                            Source ( 5 lines)
                            function pdfbox:page-labels($pdf)
                            +{
                            +  PDDocument:getDocumentCatalog($pdf)
                            +  =>PDDocumentCatalog:getPageLabels()
                            +}

                            4.23 pdfbox:page-media-box

                            Arities: #2

                            Summary
                            +Return size of $pageNo (zero based)
                            Signatures
                            pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:string
                            Parameters
                            • pdf as item()
                            • pageNo as xs:integer
                            Return
                            • xs:string
                            Tags
                            • @result: @@ -276,20 +341,21 @@ as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() -}

                            4.19 pdfbox:page-render

                            Arities: #3

                            Summary
                            +}

                            4.24 pdfbox:page-render

                            Arities: #3

                            Summary
                            Pdf page as image (zero is cover) options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??
                            Signatures
                            pdfbox:page-render ( - $pdf as item(), $pageNo as xs:integer, $options as map(*) ) as xs:base64Binary
                            Parameters
                            • pdf as item()
                            • pageNo as xs:integer
                            • options as map(*)
                            Return
                            • xs:base64Binary
                            Referenced by 0 functions from 0 modules
                              References 5 functions from 4 modules
                              • {http://www.w3.org/2005/xpath-functions/map}merge#1
                              • {java:java.io.ByteArrayOutputStream}new#0
                              • {java:java.io.ByteArrayOutputStream}toByteArray#1
                              • {java:javax.imageio.ImageIO}write#3
                              • {java:org.apache.pdfbox.rendering.PDFRenderer}new#1
                              Source ( 10 lines)
                              function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                              +			$pdf as item(), $pageNo as xs:integer, $options as map(*) ) as xs:base64Binary
                              Parameters
                              Return
                              Referenced by 0 functions from 0 modules
                              References 5 functions from 4 modules
                              Source ( 11 lines)
                              function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                               as xs:base64Binary{
                              -  let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
                              -  let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
                              -  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                              -  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                              +  let $options := map:merge(($options,map{"format":"jpg","scale":1}))
                              +  let $bufferedImage := PDFRenderer:new($pdf)
                              +                      =>PDFRenderer:renderImage($pageNo,$options?scale)
                              +  let $bytes := Q{java:java.io.ByteArrayOutputStream}new()
                              +  let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                                 return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                                        =>convert:integers-to-base64()
                                
                              -}

                              4.20 pdfbox:page-text

                              Arities: #2

                              Summary
                              +}

                              4.25 pdfbox:page-text

                              Arities: #2

                              Summary
                              return text on $pageNo
                              Signatures
                              pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string
                              Parameters
                              • pdf as item()
                              • pageNo as xs:integer
                              Return
                              • xs:string
                              Referenced by 0 functions from 0 modules
                                References 2 functions from 1 modules
                                • {java:org.apache.pdfbox.text.PDFTextStripper}getText#2
                                • {java:org.apache.pdfbox.text.PDFTextStripper}new#0
                                Source ( 9 lines)
                                function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                                @@ -300,7 +366,13 @@ as xs:string{
                                          => PDFTextStripper:setEndPage($pageNo)
                                        }
                                   return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                                -}

                                4.21 pdfbox:property

                                Arities: #2

                                Summary
                                +}

                                4.26 pdfbox:pdf-save

                                Arities: #2

                                Summary
                                +Save pdf $pdf to filesystem at $savepath , returns $savepath
                                Signatures
                                pdfbox:pdf-save + ( + $pdf as item(), $savepath as xs:string ) as xs:string
                                Parameters
                                • pdf as item()
                                • savepath as xs:string
                                Return
                                • xs:string
                                Referenced by 0 functions from 0 modules
                                  References 2 functions from 2 modules
                                  • {java:java.io.File}new#1
                                  • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
                                  Source ( 4 lines)
                                  function pdfbox:pdf-save($pdf as item(),$savepath as xs:string)
                                  +as xs:string{
                                  +   PDDocument:save($pdf, File:new($savepath)),$savepath
                                  +}

                                  4.27 pdfbox:property

                                  Arities: #2

                                  Summary
                                  return value of $property for $pdf
                                  Signatures
                                  pdfbox:property ( $pdf as item(), $property as xs:string ) as item()*
                                  Parameters
                                  • pdf as item()
                                  • property as xs:string
                                  Return
                                  • item() *
                                  Referenced by 1 functions from 1 modules
                                  References 5 functions from 2 modules
                                  • {http://www.w3.org/2001/XMLSchema}QName#1
                                  • {http://www.w3.org/2005/xpath-functions}concat#3
                                  • {http://www.w3.org/2005/xpath-functions}error#2
                                  • {http://www.w3.org/2005/xpath-functions}exists#1
                                  • {http://www.w3.org/2005/xpath-functions}fold-left#3
                                  Source ( 9 lines)
                                  function pdfbox:property($pdf as item(),$property as xs:string)
                                  @@ -309,15 +381,15 @@ as item()*{
                                     return if(exists($fns))
                                            then fold-left($fns, 
                                                           $pdf, 
                                  -                        function($result,$this as function(*)){$this($result)})
                                  +                        function($result,$this as function(*)){$result!$this(.)})
                                            else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                                  -}

                                  4.22 pdfbox:property-names

                                  Arities: #0

                                  Summary
                                  +}

                                  4.28 pdfbox:property-names

                                  Arities: #0

                                  Summary
                                  known property names sorted
                                  Signatures
                                  pdfbox:property-names ( - ) as xs:string*
                                  Return
                                  • xs:string *
                                  Referenced by 0 functions from 0 modules
                                    Source ( 4 lines)
                                    function pdfbox:property-names() 
                                    +			) as xs:string*
                                    Return
                                    Referenced by 1 functions from 1 modules
                                    Source ( 4 lines)
                                    function pdfbox:property-names() 
                                     as xs:string*{
                                       $pdfbox:property-map=>map:keys()=>sort()
                                    -}

                                    4.23 pdfbox:read-stream

                                    Arities: #2P

                                    Summary
                                    +}

                                    4.29 pdfbox:read-stream

                                    Arities: #2P

                                    Summary
                                    read next block from XMP stream
                                    Signatures
                                    pdfbox:read-stream ( $is, $read as xs:string ) as map(*)
                                    Parameters
                                    • is as 
                                    • read as xs:string
                                    Return
                                    • map(*)
                                    Referenced by 1 functions from 1 modules
                                    References 6 functions from 5 modules
                                    • {http://basex.org/modules/convert}integers-to-base64#1
                                    • {http://www.w3.org/2001/XMLSchema}byte#1
                                    • {http://www.w3.org/2001/XMLSchema}int#1
                                    • {http://www.w3.org/2005/xpath-functions}subsequence#3
                                    • {java:java.util.Arrays}copyOf#2
                                    • {java:org.apache.pdfbox.cos.COSInputStream}read#4
                                    Annotations (1)
                                    %private()
                                    Source ( 8 lines)
                                    function pdfbox:read-stream($is,$read as xs:string)
                                    @@ -327,15 +399,15 @@ as map(*){
                                       let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
                                       let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
                                       return map{"n":$n, "data": $read || $data}
                                    -}

                                    4.24 pdfbox:report

                                    Arities: #1#2

                                    Summary
                                    +}

                                    4.30 pdfbox:report

                                    Arities: #1#2

                                    Summary
                                    summary CSV style info for all properties for $pdfpaths
                                    Signatures
                                    pdfbox:report ( $pdfpaths as xs:string* ) as map(*)
                                    pdfbox:report ( - $pdfpaths as item()*, $properties as xs:string* ) as map(*)
                                    Parameters
                                    • pdfpaths as item()*
                                    • properties as xs:string*
                                    Return
                                    • map(*)
                                    Tags
                                    Referenced by 1 functions from 1 modules
                                    References 8 functions from 4 modules
                                    Source ( 28 lines)
                                    function pdfbox:report($pdfpaths as xs:string*)
                                    +			$pdfpaths as item()*, $properties as xs:string* ) as map(*)
                                    Parameters
                                    Return
                                    Tags
                                    Referenced by 1 functions from 1 modules
                                    References 8 functions from 3 modules
                                    Source ( 28 lines)
                                    function pdfbox:report($pdfpaths as xs:string*)
                                     as map(*){
                                    - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
                                    + pdfbox:report($pdfpaths,pdfbox:property-names())
                                     }
                                    function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                                     as map(*){
                                       map{"names":   array{"path",$properties},
                                    @@ -359,13 +431,14 @@ as map(*){
                                                      }
                                                    
                                       }
                                    -}

                                    4.25 pdfbox:save

                                    Arities: #2

                                    Summary
                                    -Save pdf $pdf to filesystem at $savepath , returns $savepath
                                    Signatures
                                    pdfbox:save +}

                                    4.31 pdfbox:report-save

                                    Arities: #2

                                    Summary
                                    +convenience function to save report() data to file
                                    Signatures
                                    pdfbox:report-save ( - $pdf as item(), $savepath as xs:string ) as xs:string
                                    Parameters
                                    • pdf as item()
                                    • savepath as xs:string
                                    Return
                                    • xs:string
                                    Referenced by 0 functions from 0 modules
                                      References 2 functions from 2 modules
                                      • {java:java.io.File}new#1
                                      • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
                                      Source ( 4 lines)
                                      function pdfbox:save($pdf as item(),$savepath as xs:string)
                                      -as xs:string{
                                      -   PDDocument:save($pdf, File:new($savepath)),$savepath
                                      -}

                                      4.26 pdfbox:specification

                                      Arities: #1

                                      Summary
                                      + $data as map(*), $dest as xs:string ) as empty-sequence
                                      Parameters
                                      Return
                                      Referenced by 0 functions from 0 modules
                                      References 2 functions from 2 modules
                                      Source ( 5 lines)
                                      function pdfbox:report-save($data as map(*),$dest as xs:string)
                                      +as empty-sequence(){
                                      +  let $opts := map {  "format":"xquery", "header":"yes", "separator" : "," }
                                      +  return file:write-text($dest,csv:serialize($data,$opts))
                                      +}

                                      4.32 pdfbox:specification

                                      Arities: #1

                                      Summary
                                      The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues
                                      Signatures
                                      pdfbox:specification @@ -373,14 +446,14 @@ returned as string to avoid float rounding issues $pdf as item() ) as xs:string
                                      Parameters
                                      • pdf as item()
                                      Return
                                      • xs:string
                                      Referenced by 0 functions from 0 modules
                                        References 1 functions from 1 modules
                                        • {java:org.apache.pdfbox.pdmodel.PDDocument}getVersion#1
                                        Source ( 4 lines)
                                        function pdfbox:specification($pdf as item())
                                         as xs:string{
                                          PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
                                        -}

                                        4.27 pdfbox:version

                                        Arities: #0

                                        Summary
                                        -version of Apache Pdfbox in use e.g. "3.0.4"
                                        Signatures
                                        pdfbox:version +}

                                        4.33 pdfbox:version

                                        Arities: #0

                                        Summary
                                        +Version of Apache Pdfbox in use e.g. "3.0.4"
                                        Signatures
                                        pdfbox:version ( ) as xs:string
                                        Return
                                        • xs:string
                                        Referenced by 0 functions from 0 modules
                                          References 1 functions from 1 modules
                                          • {java:org.apache.pdfbox.util.Version}getVersion#0
                                          Source ( 4 lines)
                                          function pdfbox:version()
                                           as xs:string{
                                             Q{java:org.apache.pdfbox.util.Version}getVersion()
                                          -}

                                          4.28 pdfbox:with-pdf

                                          Arities: #2

                                          Summary
                                          -with-document pattern: open pdf,apply function, close pdf +}

                                          4.34 pdfbox:with-pdf

                                          Arities: #2

                                          Summary
                                          +"With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
                                          Signatures
                                          pdfbox:with-pdf @@ -390,16 +463,18 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) as item()*{ let $pdf:=pdfbox:open($src) return try{ - $fn($pdf),pdfbox:close($pdf) + $fn($pdf),pdfbox:close($pdf) } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } -}

                                          Namespaces

                                          The following namespaces are defined:

                                          Prefix -Uri -
                                          arrayhttp://www.w3.org/2005/xpath-functions/array
                                          converthttp://basex.org/modules/convert
                                          COSInputStreamjava:org.apache.pdfbox.cos.COSInputStream
                                          dbhttp://basex.org/modules/db
                                          errhttp://www.w3.org/2005/xqt-errors
                                          fetchhttp://basex.org/modules/fetch
                                          Filejava:java.io.File
                                          fnhttp://www.w3.org/2005/xpath-functions
                                          Loaderjava:org.apache.pdfbox.Loader
                                          maphttp://www.w3.org/2005/xpath-functions/map
                                          PageExtractorjava:org.apache.pdfbox.multipdf.PageExtractor
                                          PDDocumentjava:org.apache.pdfbox.pdmodel.PDDocument
                                          PDDocumentCatalogjava:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                                          PDDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformation
                                          PDDocumentOutlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                                          pdfboxorg.expkg_zone58.Pdfbox3
                                          PDFRendererjava:org.apache.pdfbox.rendering.PDFRenderer
                                          PDFTextStripperjava:org.apache.pdfbox.text.PDFTextStripper
                                          PDMetadatajava:org.apache.pdfbox.pdmodel.common.PDMetadata
                                          PDOutlineItemjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                                          PDPagejava:org.apache.pdfbox.pdmodel.PDPage
                                          PDPageLabelsjava:org.apache.pdfbox.pdmodel.common.PDPageLabels
                                          PDPageTreejava:org.apache.pdfbox.pdmodel.PDPageTree
                                          PDRectangleorg.apache.pdfbox.pdmodel.common.PDRectangle
                                          RandomAccessReadBufferjava:org.apache.pdfbox.io.RandomAccessReadBuffer
                                          RandomAccessReadBufferedFilejava:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                                          rdfhttp://www.w3.org/1999/02/22-rdf-syntax-ns#
                                          xshttp://www.w3.org/2001/XMLSchema

                                          6 RestXQ

                                          None

                                          Source Code

                                          xquery version '3.1';
                                          +}

                                          Namespaces

                                          The following namespaces are defined:

                                          Prefix -Uri -
                                          arrayhttp://www.w3.org/2005/xpath-functions/array
                                          converthttp://basex.org/modules/convert
                                          COSInputStreamjava:org.apache.pdfbox.cos.COSInputStream
                                          csvhttp://basex.org/modules/csv
                                          dbhttp://basex.org/modules/db
                                          errhttp://www.w3.org/2005/xqt-errors
                                          fetchhttp://basex.org/modules/fetch
                                          Filejava:java.io.File
                                          filehttp://expath.org/ns/file
                                          fnhttp://www.w3.org/2005/xpath-functions
                                          Loaderjava:org.apache.pdfbox.Loader
                                          maphttp://www.w3.org/2005/xpath-functions/map
                                          PageExtractorjava:org.apache.pdfbox.multipdf.PageExtractor
                                          PDDocumentjava:org.apache.pdfbox.pdmodel.PDDocument
                                          PDDocumentCatalogjava:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                                          PDDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformation
                                          PDDocumentOutlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                                          pdfboxorg.expkg_zone58.Pdfbox3
                                          PDFRendererjava:org.apache.pdfbox.rendering.PDFRenderer
                                          PDFTextStripperjava:org.apache.pdfbox.text.PDFTextStripper
                                          PDMetadatajava:org.apache.pdfbox.pdmodel.common.PDMetadata
                                          PDOutlineItemjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                                          PDPagejava:org.apache.pdfbox.pdmodel.PDPage
                                          PDPageLabelRangejava:org.apache.pdfbox.pdmodel.common.PDPageLabelRange
                                          PDPageLabelsjava:org.apache.pdfbox.pdmodel.common.PDPageLabels
                                          PDPageTreejava:org.apache.pdfbox.pdmodel.PDPageTree
                                          PDRectangleorg.apache.pdfbox.pdmodel.common.PDRectangle
                                          RandomAccessReadBufferjava:org.apache.pdfbox.io.RandomAccessReadBuffer
                                          RandomAccessReadBufferedFilejava:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                                          rdfhttp://www.w3.org/1999/02/22-rdf-syntax-ns#
                                          xshttp://www.w3.org/2001/XMLSchema

                                          6 RestXQ

                                          None

                                          Source Code

                                          xquery version '3.1';
                                           (:~ 
                                          -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , 
                                          -requires pdfbox jars on classpath, i.e. in custom or xar
                                          -tested with pdfbox-app-3.0.5.jar
                                          +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , 
                                          +requires pdfbox jars on classpath, in lib/custom or xar
                                          +@note following the java source the terms outline and bookmark
                                          +refer to the same concept. Also label and (page)range are used interchangeably
                                          +@note tested with pdfbox-app-3.0.5.jar
                                           @see https://pdfbox.apache.org/download.cgi
                                           @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/
                                           @author Andy Bunce 2025
                                          @@ -412,6 +487,8 @@ declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper
                                           declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
                                           declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
                                           declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
                                          +declare namespace PDPageLabelRange="java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange";
                                          +
                                           declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
                                           declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage";
                                           declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
                                          @@ -422,6 +499,7 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";&#x
                                           declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata";
                                           declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream";
                                           
                                          +
                                           declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
                                           
                                           
                                          @@ -433,7 +511,7 @@ declare namespace File ="java:java.io.File";
                                           
                                           
                                           
                                          -(:~ with-document pattern: open pdf,apply function, close pdf
                                          +(:~ "With-document" pattern: open pdf,apply $fn function, close pdf
                                            creates a local pdfobject and ensures it is closed after use
                                           e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
                                           :)
                                          @@ -442,9 +520,9 @@ declare function pdfbox:with-pdf($src as xs:string,
                                           as item()*{
                                            let $pdf:=pdfbox:open($src)
                                            return try{
                                          -        $fn($pdf),pdfbox:close($pdf)
                                          +            $fn($pdf),pdfbox:close($pdf)
                                                   } catch *{
                                          -          pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
                                          +            pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
                                                   }
                                           
                                           };
                                          @@ -458,7 +536,8 @@ pdfbox:open($pdfsrc, map{})
                                           
                                           (:~ open pdf from file/url/binary, opts may have password , returns pdf object 
                                           @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item
                                          -@param $opts options otionally with map {"password":} 
                                          +@param $opts options map, may include {"password":}
                                          +@note fetch:binary for https will use a lot of memory here
                                           :)
                                           declare function pdfbox:open($pdfsrc as item(), $opts as map(*))
                                           as item(){
                                          @@ -487,7 +566,7 @@ as xs:string{
                                           };
                                           
                                           (:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :)
                                          -declare function pdfbox:save($pdf as item(),$savepath as xs:string)
                                          +declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string)
                                           as xs:string{
                                              PDDocument:save($pdf, File:new($savepath)),$savepath
                                           };
                                          @@ -519,10 +598,11 @@ as xs:integer{
                                           options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :)
                                           declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*))
                                           as xs:base64Binary{
                                          -  let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
                                          -  let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
                                          -  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                                          -  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                                          +  let $options := map:merge(($options,map{"format":"jpg","scale":1}))
                                          +  let $bufferedImage := PDFRenderer:new($pdf)
                                          +                      =>PDFRenderer:renderImage($pageNo,$options?scale)
                                          +  let $bytes := Q{java:java.io.ByteArrayOutputStream}new()
                                          +  let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format,  $bytes)
                                             return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
                                                    =>convert:integers-to-base64()
                                            
                                          @@ -534,11 +614,11 @@ as xs:base64Binary{
                                              values are sequences of functions to get property from $pdf object
                                           :)
                                           declare %private variable $pdfbox:property-map:=map{
                                          -  "pageCount": pdfbox:number-of-pages#1,
                                          +  "#pages": pdfbox:number-of-pages#1,
                                           
                                          -  "hasOutline": pdfbox:hasOutline#1,
                                          +  "#bookmarks": pdfbox:number-of-bookmarks#1,
                                           
                                          -  "hasLabels": pdfbox:hasLabels#1,
                                          +  "#labels": pdfbox:number-of-labels#1,
                                           
                                             "specification":pdfbox:specification#1,
                                           
                                          @@ -566,7 +646,8 @@ declare %private variable $pdfbox:property-map:=map{
                                           
                                             "modificationDate":  (PDDocument:getDocumentInformation#1,
                                                                   PDDocumentInformation:getModificationDate#1,
                                          -                        pdfbox:gregToISO#1)
                                          +                        pdfbox:gregToISO#1),
                                          +   "labels":      pdfbox:labels-as-strings#1                     
                                           };
                                           
                                           (:~ known property names sorted :)
                                          @@ -582,7 +663,7 @@ as item()*{
                                             return if(exists($fns))
                                                    then fold-left($fns, 
                                                                   $pdf, 
                                          -                        function($result,$this as function(*)){$this($result)})
                                          +                        function($result,$this as function(*)){$result!$this(.)})
                                                    else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                                           };
                                           
                                          @@ -590,7 +671,7 @@ as item()*{
                                           :)
                                           declare function pdfbox:report($pdfpaths as xs:string*)
                                           as map(*){
                                          - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
                                          + pdfbox:report($pdfpaths,pdfbox:property-names())
                                           };
                                           
                                           (:~ summary CSV style info for named properties for $pdfpaths 
                                          @@ -621,20 +702,18 @@ as map(*){
                                             }
                                           };
                                           
                                          -(:~ true if $pdf has an outline :)
                                          -declare function pdfbox:hasOutline($pdf as item())
                                          -as xs:boolean{
                                          -  PDDocument:getDocumentCatalog($pdf)
                                          -  =>PDDocumentCatalog:getDocumentOutline()
                                          -  =>exists()
                                          +(:~ Convenience function: serialize report() data $data with csv:serialize and write it to $dest :)
                                          +declare function pdfbox:report-save($data as map(*),$dest as xs:string)
                                          +as empty-sequence(){
                                          +  let $csv := csv:serialize($data, map{"format":"xquery", "header":"yes", "separator":","})
                                          +  return file:write-text($dest,$csv)
                                           };
                                           
                                          -(:~ true if $pdf has Labels :)
                                          -declare function pdfbox:hasLabels($pdf as item())
                                          -as xs:boolean{
                                          -  PDDocument:getDocumentCatalog($pdf)
                                          -  =>PDDocumentCatalog:getPageLabels()
                                          -  =>exists()
                                          +(:~ Count of bookmark elements at any depth in the outline XML of $pdf :)
                                          +declare function pdfbox:number-of-bookmarks($pdf as item())
                                          +as xs:integer{
                                          +  pdfbox:outline-xml($pdf)//bookmark
                                          +  =>count()
                                           };
                                           
                                           (:~ XMP metadata as "RDF" document
                                          @@ -719,7 +798,7 @@ as element(outline)?{
                                                    else ()
                                           };
                                           
                                          -(:~ recursive ouutline map to XML :)
                                          +(:~ Convert outline map to XML :)
                                           declare %private function pdfbox:bookmark-xml($outline as map(*)*)
                                           as element(bookmark)*
                                           {
                                          @@ -729,7 +808,7 @@ as element(bookmark)*
                                             </bookmark>
                                           };
                                           
                                          -(:~ return bookmark info for $bookmark
                                          +(:~ Return bookmark info for $bookmark
                                           @return map{index:..,title:..,hasChildren:..}
                                           :)
                                           declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
                                          @@ -768,21 +847,92 @@ as xs:base64Binary
                                               return (pdfbox:binary($a),pdfbox:close($a)) 
                                           };
                                           
                                          +(:~ Number of page-label ranges in $pdf, 0 if getPageLabels() yields nothing :)
                                          +declare function pdfbox:number-of-labels($pdf as item())
                                          +as xs:integer
                                          +{
                                          +  let $catalog := PDDocument:getDocumentCatalog($pdf)
                                          +  let $ranges  := PDDocumentCatalog:getPageLabels($catalog)
                                          +  return if(empty($ranges))
                                          +         then 0
                                          +         else PDPageLabels:getPageRangeCount($ranges)
                                          +};
                                           
                                          -(:~   pageLabel for every page or empty if none
                                          +(:~   pageLabel for every page, derived from the page-ranges; empty if getPageLabels() yields nothing
                                          +The returned sequence will contain at MOST as many entries as the document has pages.
                                           @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
                                           @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
                                           :)
                                          -declare function pdfbox:labels($pdf as item())
                                          +declare function pdfbox:labels-by-page($pdf as item())
                                           as xs:string*
                                           {
                                          +  (PDDocument:getDocumentCatalog($pdf)
                                          +   =>PDDocumentCatalog:getPageLabels())
                                          +  !PDPageLabels:getLabelsByPageIndices(.)
                                          +};
                                          +
                                          +(:~ label ranges defined in PDF as one comma-separated string of formatted labels :)
                                          +declare function pdfbox:labels-as-strings($pdf as item())
                                          +as xs:string{
                                             let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
                                                              =>PDDocumentCatalog:getPageLabels()
                                          -  return if(exists($pagelabels))
                                          -         then PDPageLabels:getLabelsByPageIndices($pagelabels)
                                          -         else ()
                                          +  return $pagelabels
                                          +         !(0 to pdfbox:number-of-pages($pdf)-1)
                                          +         !pdfbox:label-as-string($pagelabels,.)=>string-join(",")
                                          +            
                                           };
                                           
                                          +(:~ Result of getPageLabels() on the document catalog of $pdf :)
                                          +declare function pdfbox:page-labels($pdf)
                                          +{
                                          +  let $catalog := PDDocument:getDocumentCatalog($pdf)
                                          +  return PDDocumentCatalog:getPageLabels($catalog)
                                          +};
                                          +
                                          +(:~ label range for $page formatted as string: index, style or "-", start unless 1, "*" + prefix if any :)
                                          +declare function pdfbox:label-as-string($pagelabels,$page as  xs:integer)
                                          +as xs:string?{
                                          +  let $range := PDPageLabels:getPageLabelRange($pagelabels,$page)
                                          +  return if(exists($range))
                                          +    then
                                          +      let $style  := PDPageLabelRange:getStyle($range)
                                          +      let $first  := PDPageLabelRange:getStart($range)
                                          +      let $prefix := PDPageLabelRange:getPrefix($range)
                                          +      let $parts  := ($page,
                                          +                      if(exists($style)) then $style else "-",
                                          +                      if($first eq 1) then "" else $first,
                                          +                      if(exists($prefix)) then '*' || $prefix else ())  (:TODO double " :)
                                          +      return string-join($parts)
                                          +    else ()
                                          +};
                                          +
                                          +(:~ label-as-map results for page indices 0 to number-of-pages - 1;
                                          +    yields nothing when getPageLabels() returns empty :)
                                          +declare function pdfbox:labels-as-map($pdf as item())
                                          +as map(*)*{
                                          +  for $pagelabels in PDDocument:getDocumentCatalog($pdf)
                                          +                     =>PDDocumentCatalog:getPageLabels()
                                          +  for $page in 0 to pdfbox:number-of-pages($pdf)-1
                                          +  return pdfbox:label-as-map($pagelabels,$page)
                                          +};
                                          +
                                          +(:~ express label/page-range for $page as map, or empty when getPageLabelRange finds none :)
                                          +declare function pdfbox:label-as-map($pagelabels,$page as  xs:integer)
                                          +as map(*)?
                                          +{
                                          +  let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page)
                                          +  return if(empty($label))
                                          +  then ()
                                          +  else map{
                                          +      "index": $page,
                                          +      "prefix": PDPageLabelRange:getPrefix($label),
                                          +      "start":  PDPageLabelRange:getStart($label),
                                          +      "style":  PDPageLabelRange:getStyle($label)
                                          +      }
                                          +};
                                          +
                                          +
                                          +
                                           (:~ return text on $pageNo :)
                                           declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                                           as xs:string{
                                          @@ -794,7 +944,7 @@ as xs:string{
                                             return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                                           };
                                           
                                          -(:~ return size of $pageNo (zero based)
                                          +(:~ Return size of $pageNo (zero based)
                                           @result e.g. [0.0,0.0,168.0,239.52]
                                            :)
                                           declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer)
                                          @@ -804,13 +954,13 @@ as xs:string{
                                             =>PDRectangle:toString()
                                           };
                                           
                                          -(:~  version of Apache Pdfbox in use  e.g. "3.0.4" :)
                                          +(:~  Version of Apache Pdfbox in use  e.g. "3.0.4" :)
                                           declare function pdfbox:version()
                                           as xs:string{
                                             Q{java:org.apache.pdfbox.util.Version}getVersion()
                                           };
                                           
                                          -(:~ convert date :)
                                          +(:~ Convert date :)
                                           declare %private
                                           function pdfbox:gregToISO($item as item()?)
                                           as xs:string?{
                                          @@ -839,4 +989,4 @@ declare %private function pdfbox:do-until(
                                           };
                                           
                                          \ No newline at end of file +   on Tuesday, 3rd June 2025

                                          \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqdoc.xml b/docs/xqdoc/modules/F000001/xqdoc.xml index 8509c11..d6ff83d 100644 --- a/docs/xqdoc/modules/F000001/xqdoc.xml +++ b/docs/xqdoc/modules/F000001/xqdoc.xml @@ -1,13 +1,15 @@ -2025-06-01T21:16:07.687+01:001.1org.expkg_zone58.Pdfbox3pdfbox +2025-06-03T22:34:04.782+01:001.1org.expkg_zone58.Pdfbox3pdfbox -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar -Andy Bunce 2025https://pdfbox.apache.org/download.cgihttps://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/xquery version '3.1'; +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, in lib/custom or xar +refer to the same concept. Also label and (page)range are used interchangably +Andy Bunce 2025https://pdfbox.apache.org/download.cgifollowing the java source the terms outline and bookmarktested with pdfbox-app-3.0.5.jarhttps://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/xquery version '3.1'; (:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, in lib/custom or xar +@note following the java source the terms outline and bookmark +refer to the same concept. 
Also label and (page)range are used interchangably +@note tested with pdfbox-app-3.0.5.jar @see https://pdfbox.apache.org/download.cgi @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 @@ -20,6 +22,8 @@ declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; +declare namespace PDPageLabelRange="java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange"; + declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; @@ -30,6 +34,7 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";&#x declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; @@ -41,7 +46,7 @@ declare namespace File ="java:java.io.File"; -(:~ with-document pattern: open pdf,apply function, close pdf +(:~ "With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) :) @@ -50,9 +55,9 @@ declare function pdfbox:with-pdf($src as xs:string, as item()*{ let $pdf:=pdfbox:open($src) return try{ - $fn($pdf),pdfbox:close($pdf) + $fn($pdf),pdfbox:close($pdf) } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }; @@ -66,7 +71,8 @@ pdfbox:open($pdfsrc, map{}) (:~ open pdf from file/url/binary, opts may have password , returns pdf 
object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item -@param $opts options otionally with map {"password":} +@param $opts options options include map {"password":} +@note fetch:binary for https will use a lot of memory here :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -95,7 +101,7 @@ as xs:string{ }; (:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) -declare function pdfbox:save($pdf as item(),$savepath as xs:string) +declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; @@ -127,10 +133,11 @@ as xs:integer{ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + let $options := map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage := PDFRenderer:new($pdf) + =>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes := Q{java:java.io.ByteArrayOutputStream}new() + let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() @@ -142,11 +149,11 @@ as xs:base64Binary{ values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, + "#pages": pdfbox:number-of-pages#1, - "hasOutline": pdfbox:hasOutline#1, + "#bookmarks": pdfbox:number-of-bookmarks#1, - "hasLabels": pdfbox:hasLabels#1, + "#labels": pdfbox:number-of-labels#1, 
"specification":pdfbox:specification#1, @@ -174,7 +181,8 @@ declare %private variable $pdfbox:property-map:=map{ "modificationDate": (PDDocument:getDocumentInformation#1, PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) + pdfbox:gregToISO#1), + "labels": pdfbox:labels-as-strings#1 }; (:~ known property names sorted :) @@ -190,7 +198,7 @@ as item()*{ return if(exists($fns)) then fold-left($fns, $pdf, - function($result,$this as function(*)){$this($result)}) + function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; @@ -198,7 +206,7 @@ as item()*{ :) declare function pdfbox:report($pdfpaths as xs:string*) as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) + pdfbox:report($pdfpaths,pdfbox:property-names()) }; (:~ summary CSV style info for named properties for $pdfpaths @@ -229,20 +237,18 @@ as map(*){ } }; -(:~ true if $pdf has an outline :) -declare function pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() +(:~ convenience function to save report() data to file :) +declare function pdfbox:report-save($data as map(*),$dest as xs:string) +as empty-sequence(){ + let $opts := map { "format":"xquery", "header":"yes", "separator" : "," } + return file:write-text($dest,csv:serialize($data,$opts)) }; -(:~ true if $pdf has Labels :) -declare function pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() +(:~ number of outline items :) +declare function pdfbox:number-of-bookmarks($pdf as item()) +as xs:integer{ + let $xml:=pdfbox:outline-xml($pdf) + return count($xml//bookmark) }; (:~ XMP metadata as "RDF" document @@ -327,7 +333,7 @@ as element(outline)?{ else () }; -(:~ recursive ouutline map to XML :) +(:~ Convert outline map to XML :) declare %private function 
pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { @@ -337,7 +343,7 @@ as element(bookmark)* </bookmark> }; -(:~ return bookmark info for $bookmark +(:~ Return bookmark info for $bookmark @return map{index:..,title:..,hasChildren:..} :) declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) @@ -376,21 +382,92 @@ as xs:base64Binary return (pdfbox:binary($a),pdfbox:close($a)) }; +(:~ The number of labels defined in PDF :) +declare function pdfbox:number-of-labels($pdf as item()) +as xs:integer +{ + let $labels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($labels)) + then PDPageLabels:getPageRangeCount($labels) + else 0 +}; -(:~ pageLabel for every page or empty if none +(:~ pageLabel for every page from derived from page-ranges +The returned sequence will contain at MOST as much entries as the document has pages. @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files :) -declare function pdfbox:labels($pdf as item()) +declare function pdfbox:labels-by-page($pdf as item()) as xs:string* { + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>PDPageLabels:getLabelsByPageIndices() +}; + +(:~ sequence of label ranges defined in PDF as formatted strings :) +declare function pdfbox:labels-as-strings($pdf as item()) +as xs:string{ let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-string($pagelabels,.)=>string-join(",") + }; +(:~ get pagelabels exist :) +declare function pdfbox:page-labels($pdf) +{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() +}; + +(:~ label for $page formated as string :) +declare 
function pdfbox:label-as-string($pagelabels,$page as xs:integer) +as xs:string?{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else + let $start:= PDPageLabelRange:getStart($label) + let $style := PDPageLabelRange:getStyle($label) + let $prefix:= PDPageLabelRange:getPrefix($label) + return string-join(($page, + if(empty($style)) then "-" else $style, + if(($start eq 1)) then "" else $start, + if(exists($prefix)) then '*' || $prefix (:TODO double " :) + )) +}; + +(:~ sequence of maps for each label in :) +declare function pdfbox:labels-as-map($pdf as item()) +as map(*)*{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-map($pagelabels,.) +}; + +(:~ express label/page-range for $page as map :) +declare function pdfbox:label-as-map($pagelabels,$page as xs:integer) +as map(*) +{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else map{ + "index": $page, + "prefix": PDPageLabelRange:getPrefix($label), + "start": PDPageLabelRange:getStart($label), + "style": PDPageLabelRange:getStyle($label) + } +}; + + + (:~ return text on $pageNo :) declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{ @@ -402,7 +479,7 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }; -(:~ return size of $pageNo (zero based) +(:~ Return size of $pageNo (zero based) @result e.g. [0.0,0.0,168.0,239.52] :) declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) @@ -412,13 +489,13 @@ as xs:string{ =>PDRectangle:toString() }; -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +(:~ Version of Apache Pdfbox in use e.g. 
"3.0.4" :) declare function pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() }; -(:~ convert date :) +(:~ Convert date :) declare %private function pdfbox:gregToISO($item as item()?) as xs:string?{ @@ -445,16 +522,16 @@ declare %private function pdfbox:do-until( else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") }; -pdfbox:property-map +pdfbox:property-map property access map keys are property names, values are sequences of functions to get property from $pdf object -org.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3hasOutlineorg.expkg_zone58.Pdfbox3hasLabelsorg.expkg_zone58.Pdfbox3specificationjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetTitlejava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetAuthorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreatorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetProducerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetSubjectjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetKeywordsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreationDateorg.expkg_zone58.Pdfbox3gregToISOjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetModificationDateorg.expkg_zone58.Pdfbox3gregToISOvariable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, 
+org.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3number-of-bookmarksorg.expkg_zone58.Pdfbox3number-of-labelsorg.expkg_zone58.Pdfbox3specificationjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetTitlejava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetAuthorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreatorjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetProducerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetSubjectjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetKeywordsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetCreationDateorg.expkg_zone58.Pdfbox3gregToISOjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformationgetModificationDateorg.expkg_zone58.Pdfbox3gregToISOorg.expkg_zone58.Pdfbox3labels-as-stringsvariable $pdfbox:property-map:=map{ + "#pages": pdfbox:number-of-pages#1, - "hasOutline": pdfbox:hasOutline#1, + "#bookmarks": pdfbox:number-of-bookmarks#1, - "hasLabels": pdfbox:hasLabels#1, + "#labels": pdfbox:number-of-labels#1, "specification":pdfbox:specification#1, @@ -482,9 +559,10 @@ values are sequences of functions to get property from $pdf object "modificationDate": (PDDocument:getDocumentInformation#1, PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) + pdfbox:gregToISO#1), + "labels": pdfbox:labels-as-strings#1 } -with-document pattern: open pdf,apply function, close pdf +"With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is 
closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) pdfbox:with-pdffunction pdfbox:with-pdf ( $src as xs:string, $fn as function(item())as item()* ) as item()* { let $pdf:=pdfbox:open($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }srcxs:stringfnfunction(item())as item()*item()org.expkg_zone58.Pdfbox3openorg.expkg_zone58.Pdfbox3closeorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2005/xqt-errorscodehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:with-pdf($src as xs:string, @@ -492,9 +570,9 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) as item()*{ let $pdf:=pdfbox:open($src) return try{ - $fn($pdf),pdfbox:close($pdf) + $fn($pdf),pdfbox:close($pdf) } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } } @@ -503,7 +581,7 @@ as item(){ pdfbox:open($pdfsrc, map{}) } open pdf from file/url/binary, opts may have password , returns pdf object -$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options otionally with map {"password":}pdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item() { try{ if($pdfsrc instance of xs:base64Binary) then Loader:loadPDF( $pdfsrc,string($opts?password)) else if(starts-with($pdfsrc,"http")) then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) } catch *{ let $loc:=if($pdfsrc instance of xs:base64Binary) then "xs:base64Binary" else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } 
}pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) +$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options options include map {"password":}fetch:binary for https will use a lot of memory herepdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item() { try{ if($pdfsrc instance of xs:base64Binary) then Loader:loadPDF( $pdfsrc,string($opts?password)) else if(starts-with($pdfsrc,"http")) then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) } catch *{ let $loc:=if($pdfsrc instance of xs:base64Binary) then "xs:base64Binary" else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } }pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ try{ @@ -526,7 +604,7 @@ returned as string to avoid float rounding issues as 
xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() } -Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:savefunction pdfbox:save ( $pdf as item(),$savepath as xs:string ) as xs:string { PDDocument:save($pdf, File:new($savepath)),$savepath }pdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:save($pdf as item(),$savepath as xs:string) +Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:pdf-savefunction pdfbox:pdf-save ( $pdf as item(),$savepath as xs:string ) as xs:string { PDDocument:save($pdf, File:new($savepath)),$savepath }pdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath } @@ -548,12 +626,13 @@ as xs:integer{ PDDocument:getNumberOfPages($pdf) } Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binary { let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 
dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binary { let $options := map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage := PDFRenderer:new($pdf) =>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes := Q{java:java.io.ByteArrayOutputStream}new() let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + let $options := map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage := PDFRenderer:new($pdf) + =>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes := Q{java:java.io.ByteArrayOutputStream}new() + let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() @@ -562,19 +641,19 @@ known property names sortedpdfbo as xs:string*{ $pdfbox:property-map=>map:keys()=>sort() } -return value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()* { let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, function($result,$this as 
function(*)){$this($result)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) +return value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()* { let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) as item()*{ let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, - function($result,$this as function(*)){$this($result)}) + function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) } summary CSV style info for all properties for $pdfpaths -pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*) { pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) }pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reporthttp://www.w3.org/2005/xpath-functions/mapkeysorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:report($pdfpaths as xs:string*) +pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*) { pdfbox:report($pdfpaths,pdfbox:property-names()) 
}pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reportorg.expkg_zone58.Pdfbox3property-namesfunction pdfbox:report($pdfpaths as xs:string*) as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) + pdfbox:report($pdfpaths,pdfbox:property-names()) } summary CSV style info for named properties for $pdfpaths https://docs.basex.org/main/CSV_Functions#xquerypdfbox:reportfunction pdfbox:report ( $pdfpaths as item()*, $properties as xs:string* ) as map(*) { map{"names": array{"path",$properties}, "records": for $path in $pdfpaths let $name:=if($path instance of xs:base64Binary) then "binary" else $path return try{ let $pdf:=pdfbox:open($path) return (fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, string(pdfbox:property($pdf, $prop)))} ), pdfbox:close($pdf) ) } catch *{ fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, "#ERROR")} ) } } }pdfpathsitem()propertiesxs:stringmap(*)org.expkg_zone58.Pdfbox3openhttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendhttp://www.w3.org/2005/xpath-functionsstringorg.expkg_zone58.Pdfbox3propertyorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendfunction pdfbox:report($pdfpaths as item()*, $properties as xs:string*) @@ -600,18 +679,16 @@ as map(*){ } } +} +convenience function to save report() data to filepdfbox:report-savefunction pdfbox:report-save ( $data as map(*),$dest as xs:string ) as empty-sequence() { let $opts := map { "format":"xquery", "header":"yes", "separator" : "," } return file:write-text($dest,csv:serialize($data,$opts)) }datamap(*)destxs:stringempty-sequencehttp://expath.org/ns/filewrite-texthttp://basex.org/modules/csvserializefunction pdfbox:report-save($data as map(*),$dest as xs:string) +as empty-sequence(){ + let $opts := map { "format":"xquery", "header":"yes", 
"separator" : "," } + return file:write-text($dest,csv:serialize($data,$opts)) } -true if $pdf has an outlinepdfbox:hasOutlinefunction pdfbox:hasOutline ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() -} -true if $pdf has Labelspdfbox:hasLabelsfunction pdfbox:hasLabels ( $pdf as item() ) as xs:boolean { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() =>exists() }pdfitem()xs:booleanjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() +number of outline itemspdfbox:number-of-bookmarksfunction pdfbox:number-of-bookmarks ( $pdf as item() ) as xs:integer { let $xml:=pdfbox:outline-xml($pdf) return count($xml//bookmark) }pdfitem()xs:integerorg.expkg_zone58.Pdfbox3outline-xmlhttp://www.w3.org/2005/xpath-functionscountfunction pdfbox:number-of-bookmarks($pdf as item()) +as xs:integer{ + let $xml:=pdfbox:outline-xml($pdf) + return count($xml//bookmark) } XMP metadata as "RDF" document usually rdf:RDF root, but sometimes x:xmpmetapdfbox:metadatafunction pdfbox:metadata ( $pdf as item() ) as document-node(element(*))? 
{ let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () }pdfitem()document-node(element(*))java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDMetadataexportXMPMetadataorg.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3read-streamfunction pdfbox:metadata($pdf as item()) @@ -682,7 +759,7 @@ as element(outline)?{ then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> else () } -recursive ouutline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)* { $outline! <bookmark title="{?title}" index="{?index}"> {?children!pdfbox:bookmark-xml(.)} </bookmark> }outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) +Convert outline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)* { $outline! <bookmark title="{?title}" index="{?index}"> {?children!pdfbox:bookmark-xml(.)} </bookmark> }outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { $outline! 
@@ -690,7 +767,7 @@ as element(bookmark)* {?children!pdfbox:bookmark-xml(.)} </bookmark> } -return bookmark info for $bookmark +Return bookmark info for $bookmark map{index:..,title:..,hasChildren:..}pdfbox:bookmarkfunction pdfbox:bookmark ( $bookmark as item(),$pdf as item() ) as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} (:=>translate("�",""), :), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }bookmarkitem()pdfitem()map(*)java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemfindDestinationPagejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetTitlejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemhasChildrenfunction pdfbox:bookmark($bookmark as item(),$pdf as item()) as map(*) { @@ -719,15 +796,73 @@ as xs:base64Binary let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) } -pageLabel for every page or empty if none -https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labelsfunction pdfbox:labels ( $pdf as item() ) as xs:string* { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return if(exists($pagelabels)) then PDPageLabels:getLabelsByPageIndices($pagelabels) else () }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetLabelsByPageIndicesfunction pdfbox:labels($pdf as item()) +The number of labels defined in PDFpdfbox:number-of-labelsfunction pdfbox:number-of-labels ( $pdf as item() ) as xs:integer { let $labels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return 
if(exists($labels)) then PDPageLabels:getPageRangeCount($labels) else 0 }pdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageRangeCountfunction pdfbox:number-of-labels($pdf as item()) +as xs:integer +{ + let $labels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($labels)) + then PDPageLabels:getPageRangeCount($labels) + else 0 +} +pageLabel for every page from derived from page-ranges +The returned sequence will contain at MOST as much entries as the document has pages. +https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labels-by-pagefunction pdfbox:labels-by-page ( $pdf as item() ) as xs:string* { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() =>PDPageLabels:getLabelsByPageIndices() }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:labels-by-page($pdf as item()) as xs:string* { + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>PDPageLabels:getLabelsByPageIndices() +} +sequence of label ranges defined in PDF as formatted stringspdfbox:labels-as-stringsfunction pdfbox:labels-as-strings ( $pdf as item() ) as xs:string { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return $pagelabels !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-string($pagelabels,.)=>string-join(",") }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-stringfunction pdfbox:labels-as-strings($pdf as item()) +as xs:string{ let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then 
PDPageLabels:getLabelsByPageIndices($pagelabels) - else () + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-string($pagelabels,.)=>string-join(",") + +} +get pagelabels existpdfbox:page-labelsfunction pdfbox:page-labels ( $pdf ) { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() }pdfjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:page-labels($pdf) +{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() +} +label for $page formated as stringpdfbox:label-as-stringfunction pdfbox:label-as-string ( $pagelabels,$page as xs:integer ) as xs:string? { let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) return if(empty($label)) then () else let $start:= PDPageLabelRange:getStart($label) let $style := PDPageLabelRange:getStyle($label) let $prefix:= PDPageLabelRange:getPrefix($label) return string-join(($page, if(empty($style)) then "-" else $style, if(($start eq 1)) then "" else $start, if(exists($prefix)) then '*' || $prefix (:TODO double " :) )) }pagelabelspagexs:integerxs:stringjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylejava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixhttp://www.w3.org/2005/xpath-functionsstring-joinhttp://www.w3.org/2005/xpath-functionsemptyhttp://www.w3.org/2005/xpath-functionsexistsfunction pdfbox:label-as-string($pagelabels,$page as xs:integer) +as xs:string?{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else + let $start:= PDPageLabelRange:getStart($label) + let $style := PDPageLabelRange:getStyle($label) + let $prefix:= PDPageLabelRange:getPrefix($label) + return string-join(($page, + if(empty($style)) then "-" else $style, + if(($start eq 1)) then "" else $start, + 
if(exists($prefix)) then '*' || $prefix (:TODO double " :) + )) +} +sequence of maps for each label inpdfbox:labels-as-mapfunction pdfbox:labels-as-map ( $pdf as item() ) as map(*)* { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return $pagelabels !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-map($pagelabels,.) }pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-mapfunction pdfbox:labels-as-map($pdf as item()) +as map(*)*{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-map($pagelabels,.) +} +express label/page-range for $page as mappdfbox:label-as-mapfunction pdfbox:label-as-map ( $pagelabels,$page as xs:integer ) as map(*) { let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) return if(empty($label)) then () else map{ "index": $page, "prefix": PDPageLabelRange:getPrefix($label), "start": PDPageLabelRange:getStart($label), "style": PDPageLabelRange:getStyle($label) } }pagelabelspagexs:integermap(*)java:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylefunction pdfbox:label-as-map($pagelabels,$page as xs:integer) +as map(*) +{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else map{ + "index": $page, + "prefix": PDPageLabelRange:getPrefix($label), + "start": PDPageLabelRange:getStart($label), + "style": PDPageLabelRange:getStyle($label) + } } return text on $pageNopdfbox:page-textfunction pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string { let $tStripper := (# db:wrapjava 
instance #) { PDFTextStripper:new() => PDFTextStripper:setStartPage($pageNo) => PDFTextStripper:setEndPage($pageNo) } return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.text.PDFTextStrippernewjava:org.apache.pdfbox.text.PDFTextStrippergetTextfunction pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{ @@ -738,18 +873,18 @@ as xs:string{ } return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} } -return size of $pageNo (zero based) +Return size of $pageNo (zero based) e.g. [0.0,0.0,168.0,239.52]pdfbox:page-media-boxfunction pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:string { PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetPagefunction pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() } -version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:string { Q{java:org.apache.pdfbox.util.Version}getVersion() }xs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() +Version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:string { Q{java:org.apache.pdfbox.util.Version}getVersion() }xs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() } -convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string? { if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() else () }itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) 
+Convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string? { if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() else () }itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) as xs:string?{ if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() diff --git a/docs/xqdoc/modules/F000001/xqparse.xml b/docs/xqdoc/modules/F000001/xqparse.xml index d14bbdb..572361d 100644 --- a/docs/xqdoc/modules/F000001/xqparse.xml +++ b/docs/xqdoc/modules/F000001/xqparse.xml @@ -1,8 +1,10 @@ xquery version '3.1'; (:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, in lib/custom or xar +@note following the java source the terms outline and bookmark +refer to the same concept. 
Also label and (page)range are used interchangably +@note tested with pdfbox-app-3.0.5.jar @see https://pdfbox.apache.org/download.cgi @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 @@ -15,6 +17,8 @@ tested with pdfbox-app-3.0.5.jar declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; +declare namespace PDPageLabelRange="java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange"; + declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; @@ -25,6 +29,7 @@ tested with pdfbox-app-3.0.5.jar declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; @@ -36,7 +41,7 @@ tested with pdfbox-app-3.0.5.jar -(:~ with-document pattern: open pdf,apply function, close pdf +(:~ "With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) :) @@ -45,9 +50,9 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) as item()*{ let $pdf:=pdfbox:open($src) return try{ - $fn($pdf),pdfbox:close($pdf) + $fn($pdf),pdfbox:close($pdf) } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }; @@ -61,7 +66,8 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) (:~ open pdf from file/url/binary, opts may have password , returns pdf object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item 
-@param $opts options otionally with map {"password":} +@param $opts options options include map {"password":} +@note fetch:binary for https will use a lot of memory here :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -90,7 +96,7 @@ returned as string to avoid float rounding issues }; (:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) -declare function pdfbox:save($pdf as item(),$savepath as xs:string) +declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; @@ -122,10 +128,11 @@ returned as string to avoid float rounding issues options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + let $options := map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage := PDFRenderer:new($pdf) + =>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes := Q{java:java.io.ByteArrayOutputStream}new() + let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() @@ -137,11 +144,11 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, + "#pages": pdfbox:number-of-pages#1, - "hasOutline": pdfbox:hasOutline#1, + "#bookmarks": pdfbox:number-of-bookmarks#1, - "hasLabels": pdfbox:hasLabels#1, + "#labels": pdfbox:number-of-labels#1, "specification":pdfbox:specification#1, @@ -169,7 +176,8 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) "modificationDate": (PDDocument:getDocumentInformation#1, PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) + pdfbox:gregToISO#1), + "labels": pdfbox:labels-as-strings#1 }; (:~ known property names sorted :) @@ -185,7 +193,7 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) return if(exists($fns)) then fold-left($fns, $pdf, - function($result,$this as function(*)){$this($result)}) + function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; @@ -193,7 +201,7 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) :) declare function pdfbox:report($pdfpaths as xs:string*) as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) + pdfbox:report($pdfpaths,pdfbox:property-names()) }; (:~ summary CSV style info for named properties for $pdfpaths @@ -224,20 +232,18 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) } }; -(:~ true if $pdf has an outline :) -declare function pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() +(:~ convenience function to save report() data to file :) +declare function pdfbox:report-save($data as map(*),$dest as xs:string) +as empty-sequence(){ + let $opts := map { "format":"xquery", "header":"yes", "separator" : "," } + return file:write-text($dest,csv:serialize($data,$opts)) }; -(:~ true if $pdf has Labels :) -declare function pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() +(:~ number of outline items :) +declare function pdfbox:number-of-bookmarks($pdf as item()) +as xs:integer{ + let $xml:=pdfbox:outline-xml($pdf) + return count($xml//bookmark) }; (:~ XMP metadata as "RDF" document @@ -322,7 +328,7 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) else () }; -(:~ recursive ouutline map to XML :) +(:~ Convert outline map to XML :) declare %private function pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { @@ -332,7 +338,7 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) </bookmark> }; -(:~ return bookmark info for $bookmark +(:~ Return bookmark info for $bookmark @return map{index:..,title:..,hasChildren:..} :) declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) @@ -371,21 +377,92 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) return (pdfbox:binary($a),pdfbox:close($a)) }; +(:~ The number of labels defined in PDF :) +declare function pdfbox:number-of-labels($pdf as item()) +as xs:integer +{ + let $labels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($labels)) + then PDPageLabels:getPageRangeCount($labels) + else 0 +}; -(:~ pageLabel for every page or empty if none +(:~ pageLabel for every page from derived from page-ranges +The returned sequence will contain at MOST as much entries as the document has pages. @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files :) -declare function pdfbox:labels($pdf as item()) +declare function pdfbox:labels-by-page($pdf as item()) as xs:string* { + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>PDPageLabels:getLabelsByPageIndices() +}; + +(:~ sequence of label ranges defined in PDF as formatted strings :) +declare function pdfbox:labels-as-strings($pdf as item()) +as xs:string{ let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-string($pagelabels,.)=>string-join(",") + }; +(:~ get pagelabels exist :) +declare function pdfbox:page-labels($pdf) +{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() +}; + +(:~ label for $page formated as string :) +declare function pdfbox:label-as-string($pagelabels,$page as xs:integer) +as xs:string?{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else + let $start:= PDPageLabelRange:getStart($label) + let $style := PDPageLabelRange:getStyle($label) + let $prefix:= PDPageLabelRange:getPrefix($label) + return string-join(($page, + 
if(empty($style)) then "-" else $style, + if(($start eq 1)) then "" else $start, + if(exists($prefix)) then '*' || $prefix (:TODO double " :) + )) +}; + +(:~ sequence of maps for each label in :) +declare function pdfbox:labels-as-map($pdf as item()) +as map(*)*{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-map($pagelabels,.) +}; + +(:~ express label/page-range for $page as map :) +declare function pdfbox:label-as-map($pagelabels,$page as xs:integer) +as map(*) +{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else map{ + "index": $page, + "prefix": PDPageLabelRange:getPrefix($label), + "start": PDPageLabelRange:getStart($label), + "style": PDPageLabelRange:getStyle($label) + } +}; + + + (:~ return text on $pageNo :) declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{ @@ -397,7 +474,7 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }; -(:~ return size of $pageNo (zero based) +(:~ Return size of $pageNo (zero based) @result e.g. [0.0,0.0,168.0,239.52] :) declare function pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) @@ -407,13 +484,13 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) =>PDRectangle:toString() }; -(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +(:~ Version of Apache Pdfbox in use e.g. "3.0.4" :) declare function pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() }; -(:~ convert date :) +(:~ Convert date :) declare %private function pdfbox:gregToISO($item as item()?) as xs:string?{ diff --git a/docs/xqdoc/restxq.html b/docs/xqdoc/restxq.html index ff8aa06..972f729 100644 --- a/docs/xqdoc/restxq.html +++ b/docs/xqdoc/restxq.html @@ -7,4 +7,4 @@ Contents
                                          1. 1 Summary
                                          2. 2 Rest Paths

                                          Summary

                                          No RESTXQ usage

                                          Related documents
                                          ViewDescriptionFormat
                                          reportIndex of sourcesxhtml
                                          importsSummary of import usagexhtml
                                          imports-diagProject wide module imports as html mermaid class diagramhtml5
                                          imports-diag.mmdProject wide module imports as a mermaid class diagramtext
                                          annotationsSummary of XQuery annotation usexhtml
                                          xqdoca.xmlxqDocA run configuration report (XML)xml
                                          xqdoc-validatevalidate generated xqdoc filesxml

                                          Rest interface paths

                                          \ No newline at end of file +   on Tuesday, 3rd June 2025

                                          \ No newline at end of file diff --git a/docs/xqdoc/validation-report.xml b/docs/xqdoc/validation-report.xml index 62dfaaf..b6a91ea 100644 --- a/docs/xqdoc/validation-report.xml +++ b/docs/xqdoc/validation-report.xml @@ -1 +1 @@ -valid \ No newline at end of file +valid \ No newline at end of file diff --git a/docs/xqdoc/xqdoca.xml b/docs/xqdoc/xqdoca.xml index 92a25c7..3e91b97 100644 --- a/docs/xqdoc/xqdoca.xml +++ b/docs/xqdoc/xqdoca.xml @@ -1,4 +1,4 @@ -0.9.1docs/xqdoc/ +0.9.1docs/xqdoc/ report restxq imports diff --git a/package.json b/package.json index 02f6fc7..586c591 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.3.6", + "version": "0.3.8", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index e65c218..7afbf8c 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -1,8 +1,10 @@ xquery version '3.1'; (:~ -A BaseX 10.7+ interface to pdfbox 3.0 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, i.e. in custom or xar -tested with pdfbox-app-3.0.5.jar +A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , +requires pdfbox jars on classpath, in lib/custom or xar +@note following the java source the terms outline and bookmark +refer to the same concept. 
Also label and (page)range are used interchangably +@note tested with pdfbox-app-3.0.5.jar @see https://pdfbox.apache.org/download.cgi @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.5/ @author Andy Bunce 2025 @@ -15,6 +17,8 @@ declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; +declare namespace PDPageLabelRange="java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange"; + declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; @@ -25,6 +29,7 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; declare namespace PDMetadata="java:org.apache.pdfbox.pdmodel.common.PDMetadata"; declare namespace COSInputStream="java:org.apache.pdfbox.cos.COSInputStream"; + declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; @@ -36,7 +41,7 @@ declare namespace File ="java:java.io.File"; -(:~ "With-document" pattern: open pdf,apply function, close pdf +(:~ "With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) :) @@ -45,9 +50,9 @@ declare function pdfbox:with-pdf($src as xs:string, as item()*{ let $pdf:=pdfbox:open($src) return try{ - $fn($pdf),pdfbox:close($pdf) + $fn($pdf),pdfbox:close($pdf) } catch *{ - pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) + pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }; @@ -61,7 +66,8 @@ pdfbox:open($pdfsrc, map{}) (:~ open pdf from file/url/binary, opts may have password , returns pdf 
object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item -@param $opts options otionally with map {"password":} +@param $opts options options include map {"password":} +@note fetch:binary for https will use a lot of memory here :) declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ @@ -90,7 +96,7 @@ as xs:string{ }; (:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) -declare function pdfbox:save($pdf as item(),$savepath as xs:string) +declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; @@ -122,10 +128,11 @@ as xs:integer{ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) declare function pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"jpg","scale":1})) - let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + let $options := map:merge(($options,map{"format":"jpg","scale":1})) + let $bufferedImage := PDFRenderer:new($pdf) + =>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes := Q{java:java.io.ByteArrayOutputStream}new() + let $_ := Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() @@ -137,11 +144,11 @@ as xs:base64Binary{ values are sequences of functions to get property from $pdf object :) declare %private variable $pdfbox:property-map:=map{ - "pageCount": pdfbox:number-of-pages#1, + "#pages": pdfbox:number-of-pages#1, - "hasOutline": pdfbox:hasOutline#1, + "#bookmarks": pdfbox:number-of-bookmarks#1, - "hasLabels": pdfbox:hasLabels#1, + "#labels": pdfbox:number-of-labels#1, 
"specification":pdfbox:specification#1, @@ -169,7 +176,8 @@ declare %private variable $pdfbox:property-map:=map{ "modificationDate": (PDDocument:getDocumentInformation#1, PDDocumentInformation:getModificationDate#1, - pdfbox:gregToISO#1) + pdfbox:gregToISO#1), + "labels": pdfbox:labels-as-strings#1 }; (:~ known property names sorted :) @@ -185,7 +193,7 @@ as item()*{ return if(exists($fns)) then fold-left($fns, $pdf, - function($result,$this as function(*)){$this($result)}) + function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; @@ -193,7 +201,7 @@ as item()*{ :) declare function pdfbox:report($pdfpaths as xs:string*) as map(*){ - pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) + pdfbox:report($pdfpaths,pdfbox:property-names()) }; (:~ summary CSV style info for named properties for $pdfpaths @@ -224,20 +232,18 @@ as map(*){ } }; -(:~ true if $pdf has an outline :) -declare function pdfbox:hasOutline($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - =>exists() +(:~ convenience function to save report() data to file :) +declare function pdfbox:report-save($data as map(*),$dest as xs:string) +as empty-sequence(){ + let $opts := map { "format":"xquery", "header":"yes", "separator" : "," } + return file:write-text($dest,csv:serialize($data,$opts)) }; -(:~ true if $pdf has Labels :) -declare function pdfbox:hasLabels($pdf as item()) -as xs:boolean{ - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>exists() +(:~ number of outline items :) +declare function pdfbox:number-of-bookmarks($pdf as item()) +as xs:integer{ + let $xml:=pdfbox:outline-xml($pdf) + return count($xml//bookmark) }; (:~ XMP metadata as "RDF" document @@ -371,21 +377,92 @@ as xs:base64Binary return (pdfbox:binary($a),pdfbox:close($a)) }; +(:~ The number of labels defined in PDF :) +declare function 
pdfbox:number-of-labels($pdf as item()) +as xs:integer +{ + let $labels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($labels)) + then PDPageLabels:getPageRangeCount($labels) + else 0 +}; -(:~ pageLabel for every page or empty if none +(:~ pageLabel for every page from derived from page-ranges +The returned sequence will contain at MOST as much entries as the document has pages. @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files :) -declare function pdfbox:labels($pdf as item()) +declare function pdfbox:labels-by-page($pdf as item()) as xs:string* { + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>PDPageLabels:getLabelsByPageIndices() +}; + +(:~ sequence of label ranges defined in PDF as formatted strings :) +declare function pdfbox:labels-as-strings($pdf as item()) +as xs:string{ let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() - return if(exists($pagelabels)) - then PDPageLabels:getLabelsByPageIndices($pagelabels) - else () + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-string($pagelabels,.)=>string-join(",") + }; +(:~ get pagelabels exist :) +declare function pdfbox:page-labels($pdf) +{ + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() +}; + +(:~ label for $page formated as string :) +declare function pdfbox:label-as-string($pagelabels,$page as xs:integer) +as xs:string?{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else + let $start:= PDPageLabelRange:getStart($label) + let $style := PDPageLabelRange:getStyle($label) + let $prefix:= PDPageLabelRange:getPrefix($label) + return string-join(($page, + if(empty($style)) then "-" else $style, + if(($start eq 1)) then "" else $start, + if(exists($prefix)) then '*' || 
$prefix (:TODO double " :) + )) +}; + +(:~ sequence of maps for each label in :) +declare function pdfbox:labels-as-map($pdf as item()) +as map(*)*{ + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return $pagelabels + !(0 to pdfbox:number-of-pages($pdf)-1) + !pdfbox:label-as-map($pagelabels,.) +}; + +(:~ express label/page-range for $page as map :) +declare function pdfbox:label-as-map($pagelabels,$page as xs:integer) +as map(*) +{ + let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) + return if(empty($label)) + then () + else map{ + "index": $page, + "prefix": PDPageLabelRange:getPrefix($label), + "start": PDPageLabelRange:getStart($label), + "style": PDPageLabelRange:getStyle($label) + } +}; + + + (:~ return text on $pageNo :) declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{