From f96c64286dbc28ddecd4e252b8d0c3fab0624539 Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Mon, 23 Jun 2025 20:21:26 +0100 Subject: [PATCH] [mod] docs --- .xqdoca | 1 + changelog.md | 2 + doc.md => docs/guide.md | 23 ++- docs/xqdoc/annotations.html | 4 +- docs/xqdoc/imports.html | 2 +- docs/xqdoc/index.html | 8 +- docs/xqdoc/modules/F000001/index.html | 201 +++++++++++-------------- docs/xqdoc/modules/F000001/xqdoc.xml | 173 ++++++++++----------- docs/xqdoc/modules/F000001/xqparse.xml | 57 +++---- docs/xqdoc/restxq.html | 2 +- docs/xqdoc/validation-report.xml | 2 +- docs/xqdoc/xqdoca.xml | 4 +- package.json | 6 +- readme.md | 2 +- samples.pdf/readme.md | 4 +- src/Pdfbox3.xqm | 57 +++---- 16 files changed, 252 insertions(+), 296 deletions(-) rename doc.md => docs/guide.md (84%) diff --git a/.xqdoca b/.xqdoca index fd65659..841b9d6 100644 --- a/.xqdoca +++ b/.xqdoca @@ -1,4 +1,5 @@ src/ docs/xqdoc/ + true \ No newline at end of file diff --git a/changelog.md b/changelog.md index 217bb34..9ed79c0 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,5 @@ +# 0.5.0 2025-06-08 +* remove `hasChildren` from outline map # 0.4.0 2025-06-04 * ADD Label access * various renames diff --git a/doc.md b/docs/guide.md similarity index 84% rename from doc.md rename to docs/guide.md index 1029ff9..c47ac4d 100644 --- a/doc.md +++ b/docs/guide.md @@ -2,7 +2,11 @@ ## Introduction -The `Pdfbox3.xqm` library is an XQuery module designed to interface with **Apache PDFBox 3.0**, a powerful Java library for working with PDF documents. This module allows you to perform various operations on PDF files, such as extracting text, rendering pages as images, managing outlines, and more. The library is distributed as a **XAR (XQuery Archive) file**, which includes the necessary PDFBox JAR files, making it easy to install and use in BaseX 10.7+. +The `Pdfbox3.xqm` XQuery library module enables features from **Apache PDFBox 3.0** to be called from `BaseX`. 
+ 

>The [Apache PDFBox®](https://pdfbox.apache.org/) library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. + +The library is distributed as a **XAR (XQuery Archive) file**, which includes the necessary PDFBox JAR files, making it easy to install and use in BaseX 10.7+. --- @@ -12,18 +16,13 @@ The `Pdfbox3.xqm` library is an XQuery module designed to interface with **Apach The library is distributed as a XAR file that includes the required PDFBox JAR files. You can obtain the XAR file from the distribution source (e.g., a repository or a shared location). ### 2. Install the XAR File in BaseX -To install the XAR file in BaseX, follow these steps: +The latest version is available at https://github.com/expkg-zone58/pdfbox/releases. +The `XAR` can be installed into the repository. For example: -1. Open the BaseX GUI or command-line interface. -2. Use the `REPO INSTALL` command to install the XAR file: - - ```xquery - REPO INSTALL path/to/pdfbox3.xar - ``` - - Replace `path/to/pdfbox3.xar` with the actual path to the XAR file. - -3. Verify the installation by listing the installed packages: +``` +REPO INSTALL https://github.com/expkg-zone58/pdfbox/releases/download/v0.4.0/pdfbox-0.4.0.xar +``` +### 3. Verify the installation by listing the installed packages: ```xquery REPO LIST diff --git a/docs/xqdoc/annotations.html b/docs/xqdoc/annotations.html index f1c7141..8235a0e 100644 --- a/docs/xqdoc/annotations.html +++ b/docs/xqdoc/annotations.html @@ -6,6 +6,6 @@ / Annotations importsimports-diagimports-diag.mmdreportrestxqxqdoc-validatexqdoca.xml

Contents -

  1. Summary
  2. Annotations
    1. 2.1 http://www.w3.org/2012/xquery

Summary

This project uses 1 annotation namespaces.

Related documents
ViewDescriptionFormat
reportIndex of sourcesxhtml
restxqSummary of REST interfacexhtml
importsSummary of import usagexhtml
imports-diagProject wide module imports as html mermaid class diagramhtml5
imports-diag.mmdProject wide module imports as a mermaid class diagramtext
xqdoca.xmlxqDocA run configuration report (XML)xml
xqdoc-validatevalidate generated xqdoc filesxml

Annotations

2.1 http://www.w3.org/2012/xquery

private
\ No newline at end of file +   on Monday, 9th June 2025

\ No newline at end of file diff --git a/docs/xqdoc/imports.html b/docs/xqdoc/imports.html index 06a69c6..38768fa 100644 --- a/docs/xqdoc/imports.html +++ b/docs/xqdoc/imports.html @@ -6,4 +6,4 @@ Contents
  1. Summary
  2. Imports

    Summary

    Lists all modules imported.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    Imports (0)

    \ No newline at end of file +   on Monday, 9th June 2025

    \ No newline at end of file diff --git a/docs/xqdoc/index.html b/docs/xqdoc/index.html index 83d6779..e1a1ab7 100644 --- a/docs/xqdoc/index.html +++ b/docs/xqdoc/index.html @@ -6,9 +6,9 @@ 1 XQuery source files, and uses 1 annotation namespaces.

    This document was built from source folder C:/Users/mrwhe/git/expkg-zone58/pdfbox/src/ on - Wednesday, 4th June 2025.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    importsSummary of import usagexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    XQuery Main (0)

    None

    XQuery Library (1)

    UriPrefixDescriptionUseAMetrics
    org.expkg_zone58.Pdfbox3pdfbox + Monday, 9th June 2025.

    Related documents
    ViewDescriptionFormat
    reportIndex of sourcesxhtml
    restxqSummary of REST interfacexhtml
    importsSummary of import usagexhtml
    imports-diagProject wide module imports as html mermaid class diagramhtml5
    imports-diag.mmdProject wide module imports as a mermaid class diagramtext
    annotationsSummary of XQuery annotation usexhtml
    xqdoca.xmlxqDocA run configuration report (XML)xml
    xqdoc-validatevalidate generated xqdoc filesxml

    XQuery Main (0)

    None

    XQuery Library (1)

    UriPrefixDescriptionUseAMetrics
    org.expkg_zone58.Pdfbox3pdfbox -A BaseX 10.7+ interface to pdfbox3 https://...
    0
    Library
    ↖0
    P
    V#1
    F#37

    File view (1)

    Annotation namespaces (1)

    A total of 8 annotations are defined. -

    http://www.w3.org/2012/xquery

    0
    Library
    ↖0
    P
    V#1
    F#36

    File view (1)

    Annotation namespaces (1)

    A total of 7 annotations are defined. +

    http://www.w3.org/2012/xquery

    private7
    \ No newline at end of file +   on Monday, 9th June 2025

    \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/index.html b/docs/xqdoc/modules/F000001/index.html index d8bbd3c..f60d577 100644 --- a/docs/xqdoc/modules/F000001/index.html +++ b/docs/xqdoc/modules/F000001/index.html @@ -1,12 +1,7 @@ src - xqDocA - xqDocA

    org.expkg_zone58.Pdfbox3  library module
    P

    Summary

    - -A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , -requires pdfbox jars on classpath, in lib/custom or xar -refer to the same concept. Also label and (page)range are used interchangably -
    See also
    Authors
    • Andy Bunce 2025
    Custom

    Functions

    4.1 pdfbox:binary

    Arities: #1

    Summary
    -Create binary representation of $pdf object as xs:base64Binary
    Signatures
    pdfbox:binary +}

    Functions

    4.1 pdfbox:binary

    Arities: #1

    Summary
    +Create binary representation (xs:base64Binary) of $pdf object
    Signatures
    pdfbox:binary ( $pdf as item() ) as xs:base64Binary
    Parameters
    • pdf as item()
    Return
    • xs:base64Binary
    Referenced by 1 functions from 1 modules
    References 3 functions from 2 modules
    • {java:java.io.ByteArrayOutputStream}new#0
    • {java:java.io.ByteArrayOutputStream}toByteArray#1
    • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
    Source ( 7 lines)
    function pdfbox:binary($pdf as item())
     as xs:base64Binary{
    @@ -61,43 +56,30 @@ as xs:base64Binary{
        let $_:=PDDocument:save($pdf, $bytes)
        return  Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
              =>convert:integers-to-base64()
    -}

    4.2 pdfbox:bookmark

    Arities: #2P

    Summary
    -Return bookmark info for $bookmark -
    Signatures
    pdfbox:bookmark - ( - $bookmark as item(), $pdf as item() ) as map(*)
    Parameters
    • bookmark as item()
    • pdf as item()
    Return
    • map(*) map{index:..,title:..,hasChildren:..}
    Referenced by 1 functions from 1 modules
    References 3 functions from 1 modules
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}findDestinationPage#2
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getTitle#1
    • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}hasChildren#1
    Annotations (1)
    %private()
    Source ( 10 lines)
    function pdfbox:bookmark($bookmark as item(),$pdf as item())
    -as map(*)
    -{
    - map{ 
    -  "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
    -  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
    -  (:=>translate("�",""), :),
    -  "hasChildren": PDOutlineItem:hasChildren($bookmark)
    -  }
    -}

    4.3 pdfbox:bookmark-xml

    Arities: #1P

    Summary
    +}

    4.2 pdfbox:bookmark-xml

    Arities: #1P

    Summary
    Convert outline map to XML
    Signatures
    pdfbox:bookmark-xml ( - $outline as map(*)* ) as element(bookmark)*
    Parameters
    • outline as map(*)*
    Return
    • element(bookmark) *
    Referenced by 2 functions from 1 modules
    References 1 functions from 1 modules
    Annotations (1)
    %private()
    Source ( 8 lines)
    function pdfbox:bookmark-xml($outline as map(*)*)
    +			$outline as map(*)* ) as element(bookmark)*
    Parameters
    • outline as map(*)*
    Return
    • element(bookmark)*
    Referenced by 2 functions from 1 modules
    References 1 functions from 1 modules
    Annotations (1)
    %private()
    Source ( 8 lines)
    function pdfbox:bookmark-xml($outline as map(*)*)
     as element(bookmark)*
     {
       $outline!
       <bookmark title="{?title}" index="{?index}">
         {?children!pdfbox:bookmark-xml(.)}
       </bookmark>
    -}

    4.4 pdfbox:close

    Arities: #1

    Summary
    +}

    4.3 pdfbox:close

    Arities: #1

    Summary
    Release any resources related to $pdf
    Signatures
    pdfbox:close ( - $pdf as item() ) as empty-sequence
    Parameters
    • pdf as item()
    Return
    • empty-sequence
    Referenced by 3 functions from 1 modules
    References 1 functions from 1 modules
    • {java:org.apache.pdfbox.pdmodel.PDDocument}close#1
    Source ( 6 lines)
    function pdfbox:close($pdf as item())
    +			$pdf as item() ) as empty-sequence()
    Parameters
    Return
    Referenced by 3 functions from 1 modules
    References 1 functions from 1 modules
    Source ( 6 lines)
    function pdfbox:close($pdf as item())
     as empty-sequence(){
       (# db:wrapjava void #) {
          PDDocument:close($pdf)
       }
    -}

    4.5 pdfbox:do-until

    Arities: #3P

    Summary
    +}

    4.4 pdfbox:do-until

    Arities: #3P

    Summary
    fn:do-until shim for BaseX 9+10 if fn:do-until not found use hof:until, note: $pos always zero
    Signatures
    pdfbox:do-until ( - $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()*
    Parameters
    • input as item()*
    • action as function(item()*, xs:integer) as item()*
    • predicate as function(item()*, xs:integer) as xs:boolean?
    Return
    • item() *
    Referenced by 2 functions from 1 modules
    References 5 functions from 2 modules
    • {http://www.w3.org/2001/XMLSchema}QName#1
    • {http://www.w3.org/2005/xpath-functions}QName#2
    • {http://www.w3.org/2005/xpath-functions}error#2
    • {http://www.w3.org/2005/xpath-functions}exists#1
    • {http://www.w3.org/2005/xpath-functions}function-lookup#2
    Annotations (1)
    %private()
    Source ( 15 lines)
    function pdfbox:do-until(
    +			$input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()*
    Parameters
    Return
    Referenced by 2 functions from 1 modules
    References 5 functions from 2 modules
    Annotations (1)
    %private()
    Source ( 15 lines)
    function pdfbox:do-until(
      $input 	as item()*, 	
      $action 	as function(item()*, xs:integer) as item()*, 	
      $predicate 	as function(item()*, xs:integer) as xs:boolean? 	
    @@ -111,7 +93,7 @@ if  fn:do-until not found use hof:until, note: $pos always zero
                           then $hof($predicate(?,0),$action(?,0),$input)
                           else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
     
    -}

    4.6 pdfbox:extract-range

    Arities: #3

    Summary
    +}

    4.5 pdfbox:extract-range

    Arities: #3

    Summary
    Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based)
    Signatures
    pdfbox:extract-range ( @@ -121,10 +103,10 @@ as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) -}

    4.7 pdfbox:find-page

    Arities: #2

    Summary
    +}

    4.6 pdfbox:find-page

    Arities: #2

    Summary
    pageIndex of $page in $pdf
    Signatures
    pdfbox:find-page ( - $page as item()?, $pdf as item() ) as item()?
    Parameters
    • page as item()?
    • pdf as item()
    Return
    • item() ?
    Referenced by 0 functions from 0 modules
      References 2 functions from 2 modules
      • {http://www.w3.org/2005/xpath-functions}exists#1
      • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
      Source ( 10 lines)
      function pdfbox:find-page(
      +			$page as item()?, $pdf as item() ) as item()?
      Parameters
      Return
      Referenced by 0 functions from 0 modules
      References 2 functions from 2 modules
      Source ( 10 lines)
      function pdfbox:find-page(
          $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
          $pdf as item())
       as item()?
      @@ -133,15 +115,15 @@ as item()?
         then PDDocument:getDocumentCatalog($pdf)
             =>PDDocumentCatalog:getPages()
             =>PDPageTree:indexOf($page)
      -}

      4.8 pdfbox:gregToISO

      Arities: #1P

      Summary
      +}

      4.7 pdfbox:gregToISO

      Arities: #1P

      Summary
      Convert date
      Signatures
      pdfbox:gregToISO ( - $item as item()? ) as xs:string?
      Parameters
      • item as item()?
      Return
      • xs:string ?
      Referenced by 0 functions from 0 modules
        References 2 functions from 2 modules
        • {http://www.w3.org/2005/xpath-functions}exists#1
        • {java:java.util.GregorianCalendar}toZonedDateTime#1
        Annotations (1)
        %private()
        Source ( 6 lines)
        function pdfbox:gregToISO($item as item()?)
        +			$item as item()? ) as xs:string?
        Parameters
        Return
        Referenced by 0 functions from 0 modules
        References 2 functions from 2 modules
        Annotations (1)
        %private()
        Source ( 6 lines)
        function pdfbox:gregToISO($item as item()?)
         as xs:string?{
          if(exists($item))
          then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
          else ()
        -}

        4.9 pdfbox:label-as-map

        Arities: #2

        Summary
        +}

        4.8 pdfbox:label-as-map

        Arities: #2

        Summary
        label/page-range for $page as map
        Signatures
        pdfbox:label-as-map ( $pagelabels, $page as xs:integer ) as map(*)
        Parameters
        • pagelabels as 
        • page as xs:integer
        Return
        • map(*)
        Referenced by 1 functions from 1 modules
        References 5 functions from 3 modules
        • {http://www.w3.org/2005/xpath-functions}empty#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getPrefix#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStart#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStyle#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageLabelRange#2
        Source ( 13 lines)
        function pdfbox:label-as-map($pagelabels,$page as  xs:integer)
        @@ -156,10 +138,10 @@ as map(*)
               "start":  PDPageLabelRange:getStart($label),
               "style":  PDPageLabelRange:getStyle($label)
               }
        -}

        4.10 pdfbox:label-as-string

        Arities: #2

        Summary
        +}

        4.9 pdfbox:label-as-string

        Arities: #2

        Summary
        label for $page formated as string, empty if none
        Signatures
        pdfbox:label-as-string ( - $pagelabels, $page as xs:integer ) as xs:string?
        Parameters
        • pagelabels as 
        • page as xs:integer
        Return
        • xs:string ?
        Referenced by 1 functions from 1 modules
        References 7 functions from 3 modules
        • {http://www.w3.org/2005/xpath-functions}empty#1
        • {http://www.w3.org/2005/xpath-functions}exists#1
        • {http://www.w3.org/2005/xpath-functions}string-join#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getPrefix#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStart#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabelRange}getStyle#1
        • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageLabelRange#2
        Source ( 15 lines)
        function pdfbox:label-as-string($pagelabels,$page as  xs:integer)
        +			$pagelabels, $page as xs:integer ) as xs:string?
        Parameters
        Return
        Referenced by 1 functions from 1 modules
        References 7 functions from 3 modules
        Source ( 15 lines)
        function pdfbox:label-as-string($pagelabels,$page as  xs:integer)
         as xs:string?{
           let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page)
           return  if(empty($label))
        @@ -173,17 +155,17 @@ as xs:string?{
                                         if(($start eq 1)) then "" else $start,
                                         if(exists($prefix)) then '*' || $prefix  (:TODO double " :)
                             ))
        -}

        4.11 pdfbox:labels-as-map

        Arities: #1

        Summary
        +}

        4.10 pdfbox:labels-as-map

        Arities: #1

        Summary
        sequence of maps for each label/page range defined in $pdf
        Signatures
        pdfbox:labels-as-map ( - $pdf as item() ) as map(*)*
        Parameters
        • pdf as item()
        Return
        • map(*) *
        Referenced by 0 functions from 0 modules
          References 3 functions from 2 modules
          Source ( 8 lines)
          function pdfbox:labels-as-map($pdf as item())
          +			$pdf as item() ) as map(*)*
          Parameters
          Return
          Referenced by 0 functions from 0 modules
          References 3 functions from 2 modules
          Source ( 8 lines)
          function pdfbox:labels-as-map($pdf as item())
           as map(*)*{
             let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
                              =>PDDocumentCatalog:getPageLabels()
             return  $pagelabels
                     !(0 to pdfbox:number-of-pages($pdf)-1)
                     !pdfbox:label-as-map($pagelabels,.)
          -}

          4.12 pdfbox:labels-as-string

          Arities: #1

          Summary
          +}

          4.11 pdfbox:labels-as-string

          Arities: #1

          Summary
          sequence of label ranges defined in PDF as formatted strings
          Signatures
          pdfbox:labels-as-string ( @@ -195,22 +177,22 @@ as xs:string{ !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-string($pagelabels,.)=>string-join("&#10;") -}

          4.13 pdfbox:labels-by-page

          Arities: #1

          Summary
          +}

          4.12 pdfbox:labels-by-page

          Arities: #1

          Summary
          pageLabel for every page from derived from page-ranges The returned sequence will contain at MOST as much entries as the document has pages.
          Signatures
          pdfbox:labels-by-page ( - $pdf as item() ) as xs:string*
          Parameters
          • pdf as item()
          Return
          • xs:string *
          Tags
          Referenced by 0 functions from 0 modules
            References 1 functions from 1 modules
            • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
            Source ( 7 lines)
            function pdfbox:labels-by-page($pdf as item())
            +			$pdf as item() ) as xs:string*
            Parameters
            Return
            See also
            Referenced by 0 functions from 0 modules
            References 1 functions from 1 modules
            Source ( 7 lines)
            function pdfbox:labels-by-page($pdf as item())
             as xs:string*
             {
               PDDocument:getDocumentCatalog($pdf)
               =>PDDocumentCatalog:getPageLabels()
               =>PDPageLabels:getLabelsByPageIndices()
            -}

            4.14 pdfbox:metadata

            Arities: #1

            Summary
            +}

            4.13 pdfbox:metadata

            Arities: #1

            Summary
            XMP metadata as "RDF" document
            Signatures
            pdfbox:metadata ( - $pdf as item() ) as document-node(element(*))?
            Parameters
            • pdf as item()
            Return
            • document-node(element(*)) ?
            Tags
            • @note: + $pdf as item() ) as document-node(element(*))?
            Parameters
            Return
            Tags
            • @note: usually rdf:RDF root, but sometimes x:xmpmeta
            Referenced by 0 functions from 0 modules
            References 5 functions from 4 modules
            Source ( 17 lines)
            function pdfbox:metadata($pdf as item())
             as document-node(element(*))?
             {
            @@ -227,14 +209,14 @@ as document-node(element(*))?
                                     function($output,$pos) { $output?n eq -1 }     
                                  )?data=>parse-xml()
                       else ()
            -}

            4.15 pdfbox:number-of-bookmarks

            Arities: #1

            Summary
            +}

            4.14 pdfbox:number-of-bookmarks

            Arities: #1

            Summary
            The number of outline items defined in $pdf
            Signatures
            pdfbox:number-of-bookmarks ( $pdf as item() ) as xs:integer
            Parameters
            • pdf as item()
            Return
            • xs:integer
            Referenced by 0 functions from 0 modules
              References 2 functions from 2 modules
              Source ( 5 lines)
              function pdfbox:number-of-bookmarks($pdf as item())
               as xs:integer{
                 let $xml:=pdfbox:outline-xml($pdf)
                 return count($xml//bookmark)
              -}

              4.16 pdfbox:number-of-labels

              Arities: #1

              Summary
              +}

              4.15 pdfbox:number-of-labels

              Arities: #1

              Summary
              The number of labels defined in PDF
              Signatures
              pdfbox:number-of-labels ( $pdf as item() ) as xs:integer
              Parameters
              • pdf as item()
              Return
              • xs:integer
              Referenced by 0 functions from 0 modules
                References 3 functions from 3 modules
                • {http://www.w3.org/2005/xpath-functions}exists#1
                • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                • {java:org.apache.pdfbox.pdmodel.common.PDPageLabels}getPageRangeCount#1
                Source ( 9 lines)
                function pdfbox:number-of-labels($pdf as item())
                @@ -245,14 +227,15 @@ as xs:integer
                   return if(exists($labels)) 
                          then PDPageLabels:getPageRangeCount($labels)
                          else 0
                -}

                4.17 pdfbox:number-of-pages

                Arities: #1

                Summary
                +}

                4.16 pdfbox:number-of-pages

                Arities: #1

                Summary
                Number of pages in PDF
                Signatures
                pdfbox:number-of-pages ( $pdf as item() ) as xs:integer
                Parameters
                • pdf as item()
                Return
                • xs:integer
                Referenced by 2 functions from 1 modules
                References 1 functions from 1 modules
                • {java:org.apache.pdfbox.pdmodel.PDDocument}getNumberOfPages#1
                Source ( 4 lines)
                function pdfbox:number-of-pages($pdf as item())
                 as xs:integer{
                   PDDocument:getNumberOfPages($pdf)
                -}

                4.18 pdfbox:open

                Arities: #1#2

                Summary
                -open pdf using fetch:binary, returns pdf object
                Signatures
                pdfbox:open +}

                4.17 pdfbox:open

                Arities: #1#2

                Summary
                +open pdf from file/url/binary, opts may have password , returns pdf object +
                Signatures
                pdfbox:open ( $pdfsrc as item() ) as item()
                pdfbox:open ( @@ -276,12 +259,12 @@ as item(){ else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } -}

                4.19 pdfbox:outline

                Arities: #1#2P

                Summary
                +}

                4.18 pdfbox:outline

                Arities: #1#2P

                Summary
                Return outline for $pdf as map()*
                Signatures
                pdfbox:outline ( $pdf as item() ) as map(*)*
                pdfbox:outline ( - $pdf as item(), $outlineItem as item()? ) as map(*)*
                Parameters
                • pdf as item()
                • outlineItem as item()?
                Return
                • map(*) *
                Referenced by 3 functions from 1 modules
                References 6 functions from 5 modules
                • {http://www.w3.org/2005/xpath-functions/map}get#2
                • {http://www.w3.org/2005/xpath-functions}exists#1
                • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                • pdfbox:outline#2
                • pdfbox:outline_#2
                Annotations (1)
                %private()
                Source ( 16 lines)
                function pdfbox:outline($pdf as item())
                +			$pdf as item(), $outlineItem as item()? ) as map(*)*
                Parameters
                Return
                Referenced by 3 functions from 1 modules
                References 6 functions from 5 modules
                Annotations (1)
                %private()
                Source ( 16 lines)
                function pdfbox:outline($pdf as item())
                 as map(*)*{
                   (# db:wrapjava some #) {
                   let $outline:=
                @@ -295,28 +278,33 @@ as map(*)*{
                 as map(*)*{
                   let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem)
                   return map:get($find,"list")
                -}

                4.20 pdfbox:outline-xml

                Arities: #1

                Summary
                +}

                4.19 pdfbox:outline-xml

                Arities: #1

                Summary
                PDF outline in xml format
                Signatures
                pdfbox:outline-xml ( - $pdf as item() ) as element(outline)?
                Parameters
                • pdf as item()
                Return
                • element(outline) ?
                Referenced by 1 functions from 1 modules
                References 3 functions from 2 modules
                Source ( 7 lines)
                function pdfbox:outline-xml($pdf as item())
                +			$pdf as item() ) as element(outline)?
                Parameters
                Return
                Referenced by 1 functions from 1 modules
                References 3 functions from 2 modules
                Source ( 7 lines)
                function pdfbox:outline-xml($pdf as item())
                 as element(outline)?{
                  let $outline:=pdfbox:outline($pdf)
                   return if(exists($outline))
                          then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
                          else ()
                -}

                4.21 pdfbox:outline_

                Arities: #2P

                Summary
                +}

                4.20 pdfbox:outline_

                Arities: #2P

                Summary
                outline helper. BaseX bug 10.7? error if inlined in outline
                Signatures
                pdfbox:outline_ ( - $pdf as item(), $outlineItem as item()? ) as map(*)
                Parameters
                • pdf as item()
                • outlineItem as item()?
                Return
                • map(*)
                Referenced by 1 functions from 1 modules
                References 8 functions from 4 modules
                • {http://www.w3.org/2005/xpath-functions/map}entry#2
                • {http://www.w3.org/2005/xpath-functions/map}merge#1
                • {http://www.w3.org/2005/xpath-functions}empty#1
                • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getFirstChild#1
                • {java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem}getNextSibling#1
                • pdfbox:bookmark#2
                • pdfbox:do-until#3
                • pdfbox:outline#2
                Annotations (1)
                %private()
                Source ( 20 lines)
                function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                +			$pdf as item(), $outlineItem as item()? ) as map(*)
                Parameters
                Return
                Referenced by 1 functions from 1 modules
                References 10 functions from 4 modules
                Annotations (1)
                %private()
                Source ( 25 lines)
                function pdfbox:outline_($pdf as item(),$outlineItem as item()?)
                 as map(*){
                   pdfbox:do-until(
                     
                      map{"list":(),"this":$outlineItem},
                 
                      function($input,$pos ) { 
                -        let $bk:= pdfbox:bookmark($input?this,$pdf)
                -        let $bk:= if($bk?hasChildren)
                -                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                +        let $bookmark:=$input?this
                +        let $bk:=map{ 
                +              "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
                +              "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
                +              }
                +
                +        let $bk:= if(PDOutlineItem:hasChildren($bookmark))
                +                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark))
                                         return map:merge(($bk,map:entry("children",$kids)))
                                   else $bk 
                         return map{
                @@ -326,14 +314,14 @@ as map(*){
                 
                      function($output,$pos) { empty($output?this) }                      
                   )
                -}

                4.22 pdfbox:page-labels

                Arities: #1

                Summary
                +}

                4.21 pdfbox:page-labels

                Arities: #1

                Summary
                get pagelabels exist
                Signatures
                pdfbox:page-labels ( $pdf )
                Parameters
                • pdf as 
                Return
                Referenced by 0 functions from 0 modules
                  References 1 functions from 1 modules
                  • {java:org.apache.pdfbox.pdmodel.PDDocument}getDocumentCatalog#1
                  Source ( 5 lines)
                  function pdfbox:page-labels($pdf)
                   {
                     PDDocument:getDocumentCatalog($pdf)
                     =>PDDocumentCatalog:getPageLabels()
                  -}

                  4.23 pdfbox:page-media-box

                  Arities: #2

                  Summary
                  +}

                  4.22 pdfbox:page-media-box

                  Arities: #2

                  Summary
                  Return size of $pageNo (zero based)
                  Signatures
                  pdfbox:page-media-box ( @@ -342,7 +330,7 @@ as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() -}

                  4.24 pdfbox:page-render

                  Arities: #3

                  Summary
                  +}

                  4.23 pdfbox:page-render

                  Arities: #3

                  Summary
                  Pdf page as image (zero is cover) options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??
                  Signatures
                  pdfbox:page-render ( @@ -356,7 +344,7 @@ as xs:base64Binary{ return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() -}

                  4.25 pdfbox:page-text

                  Arities: #2

                  Summary
                  +}

                  4.24 pdfbox:page-text

                  Arities: #2

                  Summary
                  return text on $pageNo
                  Signatures
                  pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string
                  Parameters
                  • pdf as item()
                  • pageNo as xs:integer
                  Return
                  • xs:string
                  Referenced by 0 functions from 0 modules
                    References 2 functions from 1 modules
                    • {java:org.apache.pdfbox.text.PDFTextStripper}getText#2
                    • {java:org.apache.pdfbox.text.PDFTextStripper}new#0
                    Source ( 9 lines)
                    function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
                    @@ -367,16 +355,16 @@ as xs:string{
                              => PDFTextStripper:setEndPage($pageNo)
                            }
                       return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
                    -}

                    4.26 pdfbox:pdf-save

                    Arities: #2

                    Summary
                    -Save pdf $pdf to filesystem at $savepath , returns $savepath
                    Signatures
                    pdfbox:pdf-save +}

                    4.25 pdfbox:pdf-save

                    Arities: #2

                    Summary
                    +Save pdf $pdf to filesystem at $savepath , returns $savepath
                    Signatures
                    pdfbox:pdf-save ( $pdf as item(), $savepath as xs:string ) as xs:string
                    Parameters
                    • pdf as item()
                    • savepath as xs:string
                    Return
                    • xs:string
                    Referenced by 0 functions from 0 modules
                      References 2 functions from 2 modules
                      • {java:java.io.File}new#1
                      • {java:org.apache.pdfbox.pdmodel.PDDocument}save#2
                      Source ( 4 lines)
                      function pdfbox:pdf-save($pdf as item(),$savepath as xs:string)
                       as xs:string{
                          PDDocument:save($pdf, File:new($savepath)),$savepath
                      -}

                      4.27 pdfbox:property

                      Arities: #2

                      Summary
                      +}

                      4.26 pdfbox:property

                      Arities: #2

                      Summary
                      Return the value of $property for $pdf
                      Signatures
                      pdfbox:property ( - $pdf as item(), $property as xs:string ) as item()*
                      Parameters
                      • pdf as item()
                      • property as xs:string
                      Return
                      • item() *
                      Referenced by 1 functions from 1 modules
                      References 5 functions from 2 modules
                      • {http://www.w3.org/2001/XMLSchema}QName#1
                      • {http://www.w3.org/2005/xpath-functions}concat#3
                      • {http://www.w3.org/2005/xpath-functions}error#2
                      • {http://www.w3.org/2005/xpath-functions}exists#1
                      • {http://www.w3.org/2005/xpath-functions}fold-left#3
                      Source ( 9 lines)
                      function pdfbox:property($pdf as item(),$property as xs:string)
                      +			$pdf as item(), $property as xs:string ) as item()*
                      Parameters
                      Return
                      Referenced by 1 functions from 1 modules
                      References 5 functions from 2 modules
                      Source ( 9 lines)
                      function pdfbox:property($pdf as item(),$property as xs:string)
                       as item()*{
                         let $fns:= $pdfbox:property-map($property)
                         return if(exists($fns))
                      @@ -384,13 +372,13 @@ as item()*{
                                               $pdf, 
                                               function($result,$this as function(*)){$result!$this(.)})
                                else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                      -}

                      4.28 pdfbox:property-names

                      Arities: #0

                      Summary
                      +}

                      4.27 pdfbox:property-names

                      Arities: #0

                      Summary
                      Defined property names, sorted
                      Signatures
                      pdfbox:property-names ( - ) as xs:string*
                      Return
                      • xs:string *
                      Referenced by 1 functions from 1 modules
                      Source ( 4 lines)
                      function pdfbox:property-names() 
                      +			) as xs:string*
                      Return
                      Referenced by 1 functions from 1 modules
                      Source ( 4 lines)
                      function pdfbox:property-names() 
                       as xs:string*{
                         $pdfbox:property-map=>map:keys()=>sort()
                      -}

                      4.29 pdfbox:read-stream

                      Arities: #2P

                      Summary
                      +}

                      4.28 pdfbox:read-stream

                      Arities: #2P

                      Summary
                      read next block from XMP stream
                      Signatures
                      pdfbox:read-stream ( $is, $read as xs:string ) as map(*)
                      Parameters
                      • is as 
                      • read as xs:string
                      Return
                      • map(*)
                      Referenced by 1 functions from 1 modules
                      References 6 functions from 5 modules
                      • {http://basex.org/modules/convert}integers-to-base64#1
                      • {http://www.w3.org/2001/XMLSchema}byte#1
                      • {http://www.w3.org/2001/XMLSchema}int#1
                      • {http://www.w3.org/2005/xpath-functions}subsequence#3
                      • {java:java.util.Arrays}copyOf#2
                      • {java:org.apache.pdfbox.cos.COSInputStream}read#4
                      Annotations (1)
                      %private()
                      Source ( 8 lines)
                      function pdfbox:read-stream($is,$read as xs:string)
                      @@ -400,13 +388,13 @@ as map(*){
                         let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen))
                         let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string()
                         return map{"n":$n, "data": $read || $data}
                      -}

                      4.30 pdfbox:report

                      Arities: #1#2

                      Summary
                      -summary CSV style info for all properties for $pdfpaths +}

                      4.29 pdfbox:report

                      Arities: #1#2

                      Summary
                      +summary CSV style info for named $properties for PDFs in $pdfpaths
                      Signatures
                      pdfbox:report ( $pdfpaths as xs:string* ) as map(*)
                      pdfbox:report ( - $pdfpaths as item()*, $properties as xs:string* ) as map(*)
                      Parameters
                      • pdfpaths as item()*
                      • properties as xs:string*
                      Return
                      • map(*)
                      Tags
                      Referenced by 1 functions from 1 modules
                      References 8 functions from 3 modules
                      Source ( 28 lines)
                      function pdfbox:report($pdfpaths as xs:string*)
                      +			$pdfpaths as item()*, $properties as xs:string* ) as map(*)
                      Parameters
                      Return
                      See also
                      Referenced by 1 functions from 1 modules
                      References 8 functions from 3 modules
                      Source ( 28 lines)
                      function pdfbox:report($pdfpaths as xs:string*)
                       as map(*){
                        pdfbox:report($pdfpaths,pdfbox:property-names())
                       }
                      function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
                      @@ -432,14 +420,14 @@ as map(*){
                                        }
                                      
                         }
                      -}

                      4.31 pdfbox:report-save

                      Arities: #2

                      Summary
                      +}

                      4.30 pdfbox:report-save

                      Arities: #2

                      Summary
                      Convenience function to save report() data to file
                      Signatures
                      pdfbox:report-save ( - $data as map(*), $dest as xs:string ) as empty-sequence
                      Parameters
                      • data as map(*)
                      • dest as xs:string
                      Return
                      • empty-sequence
                      Referenced by 0 functions from 0 modules
                        References 2 functions from 2 modules
                        • {http://basex.org/modules/csv}serialize#2
                        • {http://expath.org/ns/file}write-text#2
                        Source ( 5 lines)
                        function pdfbox:report-save($data as map(*),$dest as xs:string)
                        +			$data as map(*), $dest as xs:string ) as empty-sequence()
                        Parameters
                        Return
                        Referenced by 0 functions from 0 modules
                        References 2 functions from 2 modules
                        Source ( 5 lines)
                        function pdfbox:report-save($data as map(*),$dest as xs:string)
                         as empty-sequence(){
                           let $opts := map {  "format":"xquery", "header":"yes", "separator" : "," }
                           return file:write-text($dest,csv:serialize($data,$opts))
                        -}

                        4.32 pdfbox:specification

                        Arities: #1

                        Summary
                        +}

                        4.31 pdfbox:specification

                        Arities: #1

                        Summary
                        The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues
                        Signatures
                        pdfbox:specification @@ -447,19 +435,19 @@ returned as string to avoid float rounding issues $pdf as item() ) as xs:string
                        Parameters
                        • pdf as item()
                        Return
                        • xs:string
                        Referenced by 0 functions from 0 modules
                          References 1 functions from 1 modules
                          • {java:org.apache.pdfbox.pdmodel.PDDocument}getVersion#1
                          Source ( 4 lines)
                          function pdfbox:specification($pdf as item())
                           as xs:string{
                            PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
                          -}

                          4.33 pdfbox:version

                          Arities: #0

                          Summary
                          +}

                          4.32 pdfbox:version

                          Arities: #0

                          Summary
                          Version of Apache Pdfbox in use e.g. "3.0.4"
                          Signatures
                          pdfbox:version ( ) as xs:string
                          Return
                          • xs:string
                          Referenced by 0 functions from 0 modules
                            References 1 functions from 1 modules
                            • {java:org.apache.pdfbox.util.Version}getVersion#0
                            Source ( 4 lines)
                            function pdfbox:version()
                             as xs:string{
                               Q{java:org.apache.pdfbox.util.Version}getVersion()
                            -}

                            4.34 pdfbox:with-pdf

                            Arities: #2

                            Summary
                            +}

                            4.33 pdfbox:with-pdf

                            Arities: #2

                            Summary
                            "With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
                            Signatures
                            pdfbox:with-pdf ( - $src as xs:string, $fn as function(item())as item()* ) as item()*
                            Parameters
                            • src as xs:string
                            • fn as function(item())as item()*
                            Return
                            • item() *
                            Referenced by 0 functions from 0 modules
                              References 3 functions from 2 modules
                              Source ( 11 lines)
                              function pdfbox:with-pdf($src as xs:string,
                              +			$src as xs:string, $fn as function(item())as item()* ) as item()*
                              Parameters
                              Return
                              Referenced by 0 functions from 0 modules
                              References 3 functions from 2 modules
                              Source ( 11 lines)
                              function pdfbox:with-pdf($src as xs:string,
                                                               $fn as function(item())as item()*)
                               as item()*{
                                let $pdf:=pdfbox:open($src)
                              @@ -469,7 +457,7 @@ as item()*{
                                           pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
                                       }
                               
                              -}

                              Namespaces

                              The following namespaces are defined:

                              Prefix -Uri -
                              arrayhttp://www.w3.org/2005/xpath-functions/array
                              converthttp://basex.org/modules/convert
                              COSInputStreamjava:org.apache.pdfbox.cos.COSInputStream
                              csvhttp://basex.org/modules/csv
                              dbhttp://basex.org/modules/db
                              errhttp://www.w3.org/2005/xqt-errors
                              fetchhttp://basex.org/modules/fetch
                              Filejava:java.io.File
                              filehttp://expath.org/ns/file
                              fnhttp://www.w3.org/2005/xpath-functions
                              Loaderjava:org.apache.pdfbox.Loader
                              maphttp://www.w3.org/2005/xpath-functions/map
                              PageExtractorjava:org.apache.pdfbox.multipdf.PageExtractor
                              PDDocumentjava:org.apache.pdfbox.pdmodel.PDDocument
                              PDDocumentCatalogjava:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                              PDDocumentInformationjava:org.apache.pdfbox.pdmodel.PDDocumentInformation
                              PDDocumentOutlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                              pdfboxorg.expkg_zone58.Pdfbox3
                              PDFRendererjava:org.apache.pdfbox.rendering.PDFRenderer
                              PDFTextStripperjava:org.apache.pdfbox.text.PDFTextStripper
                              PDMetadatajava:org.apache.pdfbox.pdmodel.common.PDMetadata
                              PDOutlineItemjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                              PDPagejava:org.apache.pdfbox.pdmodel.PDPage
                              PDPageLabelRangejava:org.apache.pdfbox.pdmodel.common.PDPageLabelRange
                              PDPageLabelsjava:org.apache.pdfbox.pdmodel.common.PDPageLabels
                              PDPageTreejava:org.apache.pdfbox.pdmodel.PDPageTree
                              PDRectangleorg.apache.pdfbox.pdmodel.common.PDRectangle
                              RandomAccessReadBufferjava:org.apache.pdfbox.io.RandomAccessReadBuffer
                              RandomAccessReadBufferedFilejava:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                              rdfhttp://www.w3.org/1999/02/22-rdf-syntax-ns#
                              xshttp://www.w3.org/2001/XMLSchema

                              6 RestXQ

                              None

                              Source Code

                              xquery version '3.1';
                              +}

                              Namespaces

                              The following namespaces are defined:

                              Prefix -Type -Uri -
                              arrayxpathhttp://www.w3.org/2005/xpath-functions/array
                              convertbasexhttp://basex.org/modules/convert
                              COSInputStreamjavajava:org.apache.pdfbox.cos.COSInputStream
                              csvbasexhttp://basex.org/modules/csv
                              dbbasexhttp://basex.org/modules/db
                              errw3chttp://www.w3.org/2005/xqt-errors
                              fetchbasexhttp://basex.org/modules/fetch
                              Filejavajava:java.io.File
                              file-http://expath.org/ns/file
                              fnxpathhttp://www.w3.org/2005/xpath-functions
                              Loaderjavajava:org.apache.pdfbox.Loader
                              mapxpathhttp://www.w3.org/2005/xpath-functions/map
                              PageExtractorjavajava:org.apache.pdfbox.multipdf.PageExtractor
                              PDDocumentjavajava:org.apache.pdfbox.pdmodel.PDDocument
                              PDDocumentCatalogjavajava:org.apache.pdfbox.pdmodel.PDDocumentCatalog
                              PDDocumentInformationjavajava:org.apache.pdfbox.pdmodel.PDDocumentInformation
                              PDDocumentOutlinejavajava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline
                              pdfbox-org.expkg_zone58.Pdfbox3
                              PDFRendererjavajava:org.apache.pdfbox.rendering.PDFRenderer
                              PDFTextStripperjavajava:org.apache.pdfbox.text.PDFTextStripper
                              PDMetadatajavajava:org.apache.pdfbox.pdmodel.common.PDMetadata
                              PDOutlineItemjavajava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
                              PDPagejavajava:org.apache.pdfbox.pdmodel.PDPage
                              PDPageLabelRangejavajava:org.apache.pdfbox.pdmodel.common.PDPageLabelRange
                              PDPageLabelsjavajava:org.apache.pdfbox.pdmodel.common.PDPageLabels
                              PDPageTreejavajava:org.apache.pdfbox.pdmodel.PDPageTree
                              PDRectanglejavajava:org.apache.pdfbox.pdmodel.common.PDRectangle
                              RandomAccessReadBufferjavajava:org.apache.pdfbox.io.RandomAccessReadBuffer
                              RandomAccessReadBufferedFilejavajava:org.apache.pdfbox.io.RandomAccessReadBufferedFile
                              rdfw3chttp://www.w3.org/1999/02/22-rdf-syntax-ns#
                              xsw3chttp://www.w3.org/2001/XMLSchema

                              6 RestXQ

                              None

                              Source Code

                              xquery version '3.1';
                               (:~ 
                               A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , 
                               requires pdfbox jars on classpath, in lib/custom or xar
                              @@ -506,7 +494,7 @@ declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#";
                               
                               declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
                               declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
                              -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
                              +declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle";
                               
                               declare namespace File ="java:java.io.File";
                               
                              @@ -529,11 +517,6 @@ as item()*{
                               };
                               
                               
                              -(:~ open pdf using fetch:binary, returns pdf object :)
                              -declare function pdfbox:open($pdfsrc as item())
                              -as item(){
                              -pdfbox:open($pdfsrc, map{})
                              -};
                               
                               (:~ open pdf from file/url/binary, opts may have password , returns pdf object 
                               @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item
                              @@ -558,6 +541,13 @@ as item(){
                               }
                               };
                               
                              +(:~ open pdf from a location, returns pdf object :)
                              +declare function pdfbox:open($pdfsrc as item())
                              +as item(){
                              +pdfbox:open($pdfsrc, map{})
                              +};
                              +
                              +
                               (:~ The version of the PDF specification used by $pdf  e.g "1.4"
                               returned as string to avoid float rounding issues
                                :)
                              @@ -566,13 +556,13 @@ as xs:string{
                                PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
                               };
                               
                              -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :)
                              +(:~ Save pdf <code>$pdf</code> to filesystem at <code>$savepath</code> , returns $savepath :)
                               declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string)
                               as xs:string{
                                  PDDocument:save($pdf, File:new($savepath)),$savepath
                               };
                               
                              -(:~ Create binary representation of $pdf object as xs:base64Binary :)
                              +(:~ Create binary representation (xs:base64Binary) of <code>$pdf</code> object  :)
                               declare function pdfbox:binary($pdf as item())
                               as xs:base64Binary{
                                  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
                              @@ -669,12 +659,6 @@ as item()*{
                                        else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined."))
                               };
                               
                              -(:~ summary CSV style info for all properties for $pdfpaths 
                              -:)
                              -declare function pdfbox:report($pdfpaths as xs:string*)
                              -as map(*){
                              - pdfbox:report($pdfpaths,pdfbox:property-names())
                              -};
                               
                               (:~ summary CSV style info for named $properties for PDFs in $pdfpaths 
                               @see https://docs.basex.org/main/CSV_Functions#xquery
                              @@ -704,6 +688,13 @@ as map(*){
                                 }
                               };
                               
                              +(:~ summary CSV style info for all properties for $pdfpaths 
                              +:)
                              +declare function pdfbox:report($pdfpaths as xs:string*)
                              +as map(*){
                              + pdfbox:report($pdfpaths,pdfbox:property-names())
                              +};
                              +
                               (:~ Convenience function to save report() data to file :)
                               declare function pdfbox:report-save($data as map(*),$dest as xs:string)
                               as empty-sequence(){
                              @@ -777,9 +768,14 @@ as map(*){
                                    map{"list":(),"this":$outlineItem},
                               
                                    function($input,$pos ) { 
                              -        let $bk:= pdfbox:bookmark($input?this,$pdf)
                              -        let $bk:= if($bk?hasChildren)
                              -                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
                              +        let $bookmark:=$input?this
                              +        let $bk:=map{ 
                              +              "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
                              +              "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
                              +              }
                              +
                              +        let $bk:= if(PDOutlineItem:hasChildren($bookmark))
                              +                  then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark))
                                                       return map:merge(($bk,map:entry("children",$kids)))
                                                 else $bk 
                                       return map{
                              @@ -810,21 +806,6 @@ as element(bookmark)*
                                 </bookmark>
                               };
                               
                              -(:~ Return bookmark info for $bookmark
                              -@return map{index:..,title:..,hasChildren:..}
                              -:)
                              -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
                              -as map(*)
                              -{
                              - map{ 
                              -  "index":  PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
                              -  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
                              -  (:=>translate("�",""), :),
                              -  "hasChildren": PDOutlineItem:hasChildren($bookmark)
                              -  }
                              -};
                              -
                              -
                               (:~ pageIndex of $page in $pdf :)
                               declare function pdfbox:find-page(
                                  $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
                              @@ -993,4 +974,4 @@ declare %private function pdfbox:do-until(
                               };
                               
                              \ No newline at end of file +   on Monday, 9th June 2025

                              \ No newline at end of file diff --git a/docs/xqdoc/modules/F000001/xqdoc.xml b/docs/xqdoc/modules/F000001/xqdoc.xml index 1e807fb..f881bc3 100644 --- a/docs/xqdoc/modules/F000001/xqdoc.xml +++ b/docs/xqdoc/modules/F000001/xqdoc.xml @@ -1,4 +1,4 @@ -2025-06-04T16:17:13.527+01:001.1org.expkg_zone58.Pdfbox3pdfbox +2025-06-09T21:09:05.833+01:001.1org.expkg_zone58.Pdfbox3pdfbox A BaseX 10.7+ interface to pdfbox3 https://pdfbox.apache.org/ , requires pdfbox jars on classpath, in lib/custom or xar @@ -40,7 +40,7 @@ declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; +declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle"; declare namespace File ="java:java.io.File"; @@ -63,11 +63,6 @@ as item()*{ }; -(:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -}; (:~ open pdf from file/url/binary, opts may have password , returns pdf object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item @@ -92,6 +87,13 @@ as item(){ } }; +(:~ open pdf from a location, returns pdf object :) +declare function pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +}; + + (:~ The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) @@ -100,13 +102,13 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ Save pdf <code>$pdf</code> to filesystem at <code>$savepath</code> , returns $savepath :) declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as 
xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ Create binary representation of $pdf object as xs:base64Binary :) +(:~ Create binary representation (xs:base64Binary) of <code>$pdf</code> object :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -203,12 +205,6 @@ as item()*{ else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; -(:~ summary CSV style info for all properties for $pdfpaths -:) -declare function pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,pdfbox:property-names()) -}; (:~ summary CSV style info for named $properties for PDFs in $pdfpaths @see https://docs.basex.org/main/CSV_Functions#xquery @@ -238,6 +234,13 @@ as map(*){ } }; +(:~ summary CSV style info for all properties for $pdfpaths +:) +declare function pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,pdfbox:property-names()) +}; + (:~ Convenience function to save report() data to file :) declare function pdfbox:report-save($data as map(*),$dest as xs:string) as empty-sequence(){ @@ -311,9 +314,14 @@ as map(*){ map{"list":(),"this":$outlineItem}, function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + let $bookmark:=$input?this + let $bk:=map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + } + + let $bk:= if(PDOutlineItem:hasChildren($bookmark)) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ @@ -344,21 +352,6 @@ as element(bookmark)* </bookmark> }; -(:~ Return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} -:) -declare %private function 
pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -}; - - (:~ pageIndex of $page in $pdf :) declare function pdfbox:find-page( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), @@ -525,7 +518,7 @@ declare %private function pdfbox:do-until( else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") }; -pdfbox:property-map +pdfbox:property-map Defines a map from property names to evaluation method. Keys are property names, values are sequences of functions to get property value starting from a $pdf object. @@ -569,7 +562,7 @@ values are sequences of functions to get property value starting from a $pdf obj "With-document" pattern: open pdf,apply $fn function, close pdf creates a local pdfobject and ensures it is closed after use e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) -pdfbox:with-pdffunction pdfbox:with-pdf ( $src as xs:string, $fn as function(item())as item()* ) as item()* { let $pdf:=pdfbox:open($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } }srcxs:stringfnfunction(item())as item()*item()org.expkg_zone58.Pdfbox3openorg.expkg_zone58.Pdfbox3closeorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2005/xqt-errorscodehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:with-pdf($src as xs:string, +pdfbox:with-pdffunction pdfbox:with-pdf ( $src as xs:string, $fn as function(item())as item()* ) as item()*srcxs:stringfnfunction(item())as 
item()*item()org.expkg_zone58.Pdfbox3openorg.expkg_zone58.Pdfbox3closeorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2005/xqt-errorscodehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:with-pdf($src as xs:string, $fn as function(item())as item()*) as item()*{ let $pdf:=pdfbox:open($src) @@ -579,13 +572,9 @@ as item()*{ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description) } -} -open pdf using fetch:binary, returns pdf objectpdfbox:openfunction pdfbox:open ( $pdfsrc as item() ) as item() { pdfbox:open($pdfsrc, map{}) }pdfsrcitem()item()org.expkg_zone58.Pdfbox3openfunction pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) } open pdf from file/url/binary, opts may have password , returns pdf object -$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options options include map {"password":}fetch:binary for https will use a lot of memory herepdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item() { try{ if($pdfsrc instance of xs:base64Binary) then Loader:loadPDF( $pdfsrc,string($opts?password)) else if(starts-with($pdfsrc,"http")) then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) } catch *{ let $loc:=if($pdfsrc instance of xs:base64Binary) then "xs:base64Binary" else $pdfsrc return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } 
}pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) +$pdfsrc a fetchable url or filepath, or xs:base64Binary item$opts options options include map {"password":}fetch:binary for https will use a lot of memory herepdfbox:openfunction pdfbox:open ( $pdfsrc as item(), $opts as map(*) ) as item()pdfsrcitem()optsmap(*)item()java:org.apache.pdfbox.LoaderloadPDFhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionsstarts-withjava:org.apache.pdfbox.LoaderloadPDFhttp://basex.org/modules/fetchbinaryhttp://www.w3.org/2005/xpath-functionsstringjava:org.apache.pdfbox.LoaderloadPDFjava:org.apache.pdfbox.io.RandomAccessReadBufferedFilenewhttp://www.w3.org/2005/xpath-functionsstringhttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xqt-errorsdescriptionfunction pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ try{ @@ -602,35 +591,39 @@ as item(){ return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } } +open pdf from a location, returns pdf objectpdfbox:openfunction pdfbox:open ( $pdfsrc as item() ) as item()pdfsrcitem()item()org.expkg_zone58.Pdfbox3openfunction pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +} The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues -pdfbox:specificationfunction pdfbox:specification ( $pdf as item() ) as xs:string 
{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetVersionfunction pdfbox:specification($pdf as item()) +pdfbox:specificationfunction pdfbox:specification ( $pdf as item() ) as xs:stringpdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetVersionfunction pdfbox:specification($pdf as item()) as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() -} -Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:pdf-savefunction pdfbox:pdf-save ( $pdf as item(),$savepath as xs:string ) as xs:string { PDDocument:save($pdf, File:new($savepath)),$savepath }pdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:pdf-save($pdf as item(),$savepath as xs:string) +} +Save pdf $pdf to filesystem at $savepath , returns $savepathpdfbox:pdf-savefunction pdfbox:pdf-save ( $pdf as item(),$savepath as xs:string ) as xs:stringpdfitem()savepathxs:stringxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.Filenewfunction pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath -} -Create binary representation of $pdf object as xs:base64Binarypdfbox:binaryfunction pdfbox:binary ( $pdf as item() ) as xs:base64Binary { let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=PDDocument:save($pdf, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()xs:base64Binaryjava:java.io.ByteArrayOutputStreamnewjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:binary($pdf as item()) +} +Create binary representation (xs:base64Binary) of $pdf objectpdfbox:binaryfunction pdfbox:binary ( $pdf as item() ) as 
xs:base64Binarypdfitem()xs:base64Binaryjava:java.io.ByteArrayOutputStreamnewjava:org.apache.pdfbox.pdmodel.PDDocumentsavejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=PDDocument:save($pdf, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() } -Release any resources related to $pdfpdfbox:closefunction pdfbox:close ( $pdf as item() ) as empty-sequence() { (# db:wrapjava void #) { PDDocument:close($pdf) } }pdfitem()empty-sequencejava:org.apache.pdfbox.pdmodel.PDDocumentclosefunction pdfbox:close($pdf as item()) +Release any resources related to $pdfpdfbox:closefunction pdfbox:close ( $pdf as item() ) as empty-sequence()pdfitem()empty-sequence()java:org.apache.pdfbox.pdmodel.PDDocumentclosefunction pdfbox:close($pdf as item()) as empty-sequence(){ (# db:wrapjava void #) { PDDocument:close($pdf) } } -Number of pages in PDFpdfbox:number-of-pagesfunction pdfbox:number-of-pages ( $pdf as item() ) as xs:integer { PDDocument:getNumberOfPages($pdf) }pdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetNumberOfPagesfunction pdfbox:number-of-pages($pdf as item()) +Number of pages in PDFpdfbox:number-of-pagesfunction pdfbox:number-of-pages ( $pdf as item() ) as xs:integerpdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetNumberOfPagesfunction pdfbox:number-of-pages($pdf as item()) as xs:integer{ PDDocument:getNumberOfPages($pdf) } Pdf page as image (zero is cover) -options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binary { let $options := map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage := PDFRenderer:new($pdf) =>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes := Q{java:java.io.ByteArrayOutputStream}new() let $_ := 
Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) =>convert:integers-to-base64() }pdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi??pdfbox:page-renderfunction pdfbox:page-render ( $pdf as item(),$pageNo as xs:integer,$options as map(*) ) as xs:base64Binarypdfitem()pageNoxs:integeroptionsmap(*)xs:base64Binaryhttp://www.w3.org/2005/xpath-functions/mapmergejava:org.apache.pdfbox.rendering.PDFRenderernewjava:java.io.ByteArrayOutputStreamnewjava:javax.imageio.ImageIOwritejava:java.io.ByteArrayOutputStreamtoByteArrayfunction pdfbox:page-render($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ let $options := map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage := PDFRenderer:new($pdf) @@ -641,11 +634,11 @@ as xs:base64Binary{ =>convert:integers-to-base64() } -Defined property names, sortedpdfbox:property-namesfunction pdfbox:property-names ( ) as xs:string* { $pdfbox:property-map=>map:keys()=>sort() }xs:stringorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property-names() +Defined property names, sortedpdfbox:property-namesfunction pdfbox:property-names ( ) as xs:string*xs:stringorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property-names() as xs:string*{ $pdfbox:property-map=>map:keys()=>sort() } -Return the value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()* { let $fns:= $pdfbox:property-map($property) return if(exists($fns)) then fold-left($fns, $pdf, function($result,$this as 
function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) +Return the value of $property for $pdfpdfbox:propertyfunction pdfbox:property ( $pdf as item(),$property as xs:string ) as item()*pdfitem()propertyxs:stringitem()http://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamehttp://www.w3.org/2005/xpath-functionsconcatorg.expkg_zone58.Pdfbox3property-mapfunction pdfbox:property($pdf as item(),$property as xs:string) as item()*{ let $fns:= $pdfbox:property-map($property) return if(exists($fns)) @@ -653,14 +646,9 @@ as item()*{ $pdf, function($result,$this as function(*)){$result!$this(.)}) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) -} -summary CSV style info for all properties for $pdfpaths -pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*) { pdfbox:report($pdfpaths,pdfbox:property-names()) }pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reportorg.expkg_zone58.Pdfbox3property-namesfunction pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,pdfbox:property-names()) } summary CSV style info for named $properties for PDFs in $pdfpaths -https://docs.basex.org/main/CSV_Functions#xquerypdfbox:reportfunction pdfbox:report ( $pdfpaths as item()*, $properties as xs:string* ) as map(*) { map{"names": array{"path",$properties}, "records": for $path in $pdfpaths let $name:=if($path instance of xs:base64Binary) then "binary" else $path return try{ let 
$pdf:=pdfbox:open($path) return (fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, string(pdfbox:property($pdf, $prop)))} ), pdfbox:close($pdf) ) } catch *{ fold-left($properties, array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, "#ERROR")} ) } } }pdfpathsitem()propertiesxs:stringmap(*)org.expkg_zone58.Pdfbox3openhttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendhttp://www.w3.org/2005/xpath-functionsstringorg.expkg_zone58.Pdfbox3propertyorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendfunction pdfbox:report($pdfpaths as item()*, $properties as xs:string*) +https://docs.basex.org/main/CSV_Functions#xquerypdfbox:reportfunction pdfbox:report ( $pdfpaths as item()*, $properties as xs:string* ) as map(*)pdfpathsitem()propertiesxs:stringmap(*)org.expkg_zone58.Pdfbox3openhttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendhttp://www.w3.org/2005/xpath-functionsstringorg.expkg_zone58.Pdfbox3propertyorg.expkg_zone58.Pdfbox3closehttp://www.w3.org/2005/xpath-functionsfold-lefthttp://www.w3.org/2005/xpath-functions/arrayappendfunction pdfbox:report($pdfpaths as item()*, $properties as xs:string*) as map(*){ map{"names": array{"path",$properties}, @@ -683,19 +671,24 @@ as map(*){ } } +} +summary CSV style info for all properties for $pdfpaths +pdfbox:reportfunction pdfbox:report ( $pdfpaths as xs:string* ) as map(*)pdfpathsxs:stringmap(*)org.expkg_zone58.Pdfbox3reportorg.expkg_zone58.Pdfbox3property-namesfunction pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,pdfbox:property-names()) } -Convenience function to save report() data to filepdfbox:report-savefunction pdfbox:report-save ( $data as map(*),$dest as xs:string ) as empty-sequence() { let $opts := map { "format":"xquery", 
"header":"yes", "separator" : "," } return file:write-text($dest,csv:serialize($data,$opts)) }datamap(*)destxs:stringempty-sequencehttp://expath.org/ns/filewrite-texthttp://basex.org/modules/csvserializefunction pdfbox:report-save($data as map(*),$dest as xs:string) +Convenience function to save report() data to filepdfbox:report-savefunction pdfbox:report-save ( $data as map(*),$dest as xs:string ) as empty-sequence()datamap(*)destxs:stringempty-sequence()http://expath.org/ns/filewrite-texthttp://basex.org/modules/csvserializefunction pdfbox:report-save($data as map(*),$dest as xs:string) as empty-sequence(){ let $opts := map { "format":"xquery", "header":"yes", "separator" : "," } return file:write-text($dest,csv:serialize($data,$opts)) } -The number of outline items defined in $pdfpdfbox:number-of-bookmarksfunction pdfbox:number-of-bookmarks ( $pdf as item() ) as xs:integer { let $xml:=pdfbox:outline-xml($pdf) return count($xml//bookmark) }pdfitem()xs:integerorg.expkg_zone58.Pdfbox3outline-xmlhttp://www.w3.org/2005/xpath-functionscountfunction pdfbox:number-of-bookmarks($pdf as item()) +The number of outline items defined in $pdfpdfbox:number-of-bookmarksfunction pdfbox:number-of-bookmarks ( $pdf as item() ) as xs:integerpdfitem()xs:integerorg.expkg_zone58.Pdfbox3outline-xmlhttp://www.w3.org/2005/xpath-functionscountfunction pdfbox:number-of-bookmarks($pdf as item()) as xs:integer{ let $xml:=pdfbox:outline-xml($pdf) return count($xml//bookmark) } XMP metadata as "RDF" document -usually rdf:RDF root, but sometimes x:xmpmetapdfbox:metadatafunction pdfbox:metadata ( $pdf as item() ) as document-node(element(*))? 
{ let $m:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getMetadata() return if(exists($m)) then let $is:=PDMetadata:exportXMPMetadata($m) return pdfbox:do-until( map{"n":0,"data":""}, function($input,$pos ) { pdfbox:read-stream($is,$input?data)}, function($output,$pos) { $output?n eq -1 } )?data=>parse-xml() else () }pdfitem()document-node(element(*))java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDMetadataexportXMPMetadataorg.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3read-streamfunction pdfbox:metadata($pdf as item()) +usually rdf:RDF root, but sometimes x:xmpmetapdfbox:metadatafunction pdfbox:metadata ( $pdf as item() ) as document-node(element(*))?pdfitem()document-node(element(*))java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDMetadataexportXMPMetadataorg.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3read-streamfunction pdfbox:metadata($pdf as item()) as document-node(element(*))? { let $m:=PDDocument:getDocumentCatalog($pdf) @@ -712,7 +705,7 @@ as document-node(element(*))? 
)?data=>parse-xml() else () } -read next block from XMP streampdfbox:read-streamfunction pdfbox:read-stream ( $is,$read as xs:string ) as map(*) { let $blen:=4096 let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) let $n:= COSInputStream:read($is,$buff,xs:int(0),xs:int($blen)) let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() return map{"n":$n, "data": $read || $data} }isreadxs:stringmap(*)java:java.util.ArrayscopyOfhttp://www.w3.org/2001/XMLSchemabytejava:org.apache.pdfbox.cos.COSInputStreamreadhttp://www.w3.org/2001/XMLSchemainthttp://www.w3.org/2001/XMLSchemainthttp://basex.org/modules/convertintegers-to-base64http://www.w3.org/2005/xpath-functionssubsequencefunction pdfbox:read-stream($is,$read as xs:string) +read next block from XMP streampdfbox:read-streamfunction pdfbox:read-stream ( $is,$read as xs:string ) as map(*)isreadxs:stringmap(*)java:java.util.ArrayscopyOfhttp://www.w3.org/2001/XMLSchemabytejava:org.apache.pdfbox.cos.COSInputStreamreadhttp://www.w3.org/2001/XMLSchemainthttp://www.w3.org/2001/XMLSchemainthttp://basex.org/modules/convertintegers-to-base64http://www.w3.org/2005/xpath-functionssubsequencefunction pdfbox:read-stream($is,$read as xs:string) as map(*){ let $blen:=4096 let $buff:=Q{java:java.util.Arrays}copyOf(array{xs:byte(0)},$blen) @@ -720,7 +713,7 @@ as map(*){ let $data:=convert:integers-to-base64(subsequence($buff,1,$n))=>convert:binary-to-string() return map{"n":$n, "data": $read || $data} } -Return outline for $pdf as map()*pdfbox:outlinefunction pdfbox:outline ( $pdf as item() ) as map(*)* { (# db:wrapjava some #) { let $outline:= PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getDocumentOutline() return if(exists($outline)) then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) } 
}pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildfunction pdfbox:outline($pdf as item()) +Return outline for $pdf as map()*pdfbox:outlinefunction pdfbox:outline ( $pdf as item() ) as map(*)*pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildfunction pdfbox:outline($pdf as item()) as map(*)*{ (# db:wrapjava some #) { let $outline:= @@ -731,21 +724,26 @@ as map(*)*{ then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline)) } } -return bookmark info for children of $outlineItem as seq of mapspdfbox:outlinefunction pdfbox:outline ( $pdf as item(),$outlineItem as item()? ) as map(*)* { let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3outline_http://www.w3.org/2005/xpath-functions/mapgetfunction pdfbox:outline($pdf as item(),$outlineItem as item()?) +return bookmark info for children of $outlineItem as seq of mapspdfbox:outlinefunction pdfbox:outline ( $pdf as item(),$outlineItem as item()? ) as map(*)*pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3outline_http://www.w3.org/2005/xpath-functions/mapgetfunction pdfbox:outline($pdf as item(),$outlineItem as item()?) as map(*)*{ let $find as map(*):=pdfbox:outline_($pdf ,$outlineItem) return map:get($find,"list") } -outline helper. BaseX bug 10.7? error if inlined in outlinepdfbox:outline_function pdfbox:outline_ ( $pdf as item(),$outlineItem as item()? 
) as map(*) { pdfbox:do-until( map{"list":(),"this":$outlineItem}, function($input,$pos ) { let $bk:= pdfbox:bookmark($input?this,$pdf) let $bk:= if($bk?hasChildren) then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ "list": ($input?list, $bk), "this": PDOutlineItem:getNextSibling($input?this)} }, function($output,$pos) { empty($output?this) } ) }pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3do-untilorg.expkg_zone58.Pdfbox3bookmarkorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildhttp://www.w3.org/2005/xpath-functions/mapmergehttp://www.w3.org/2005/xpath-functions/mapentryjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetNextSiblinghttp://www.w3.org/2005/xpath-functionsemptyfunction pdfbox:outline_($pdf as item(),$outlineItem as item()?) +outline helper. BaseX bug 10.7? error if inlined in outlinepdfbox:outline_function pdfbox:outline_ ( $pdf as item(),$outlineItem as item()? ) as map(*)pdfitem()outlineItemitem()map(*)org.expkg_zone58.Pdfbox3do-untiljava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemfindDestinationPagejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetTitlejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemhasChildrenorg.expkg_zone58.Pdfbox3outlinejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetFirstChildhttp://www.w3.org/2005/xpath-functions/mapmergehttp://www.w3.org/2005/xpath-functions/mapentryjava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetNextSiblinghttp://www.w3.org/2005/xpath-functionsemptyfunction pdfbox:outline_($pdf as item(),$outlineItem as item()?) 
as map(*){ pdfbox:do-until( map{"list":(),"this":$outlineItem}, function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + let $bookmark:=$input?this + let $bk:=map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + } + + let $bk:= if(PDOutlineItem:hasChildren($bookmark)) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ @@ -756,14 +754,14 @@ as map(*){ function($output,$pos) { empty($output?this) } ) } -PDF outline in xml formatpdfbox:outline-xmlfunction pdfbox:outline-xml ( $pdf as item() ) as element(outline)? { let $outline:=pdfbox:outline($pdf) return if(exists($outline)) then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> else () }pdfitem()element(outline)org.expkg_zone58.Pdfbox3outlinehttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:outline-xml($pdf as item()) +PDF outline in xml formatpdfbox:outline-xmlfunction pdfbox:outline-xml ( $pdf as item() ) as element(outline)?pdfitem()element(outline)org.expkg_zone58.Pdfbox3outlinehttp://www.w3.org/2005/xpath-functionsexistsorg.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:outline-xml($pdf as item()) as element(outline)?{ let $outline:=pdfbox:outline($pdf) return if(exists($outline)) then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline> else () } -Convert outline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)* { $outline! 
<bookmark title="{?title}" index="{?index}"> {?children!pdfbox:bookmark-xml(.)} </bookmark> }outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) +Convert outline map to XMLpdfbox:bookmark-xmlfunction pdfbox:bookmark-xml ( $outline as map(*)* ) as element(bookmark)*outlinemap(*)element(bookmark)org.expkg_zone58.Pdfbox3bookmark-xmlfunction pdfbox:bookmark-xml($outline as map(*)*) as element(bookmark)* { $outline! @@ -771,18 +769,7 @@ as element(bookmark)* {?children!pdfbox:bookmark-xml(.)} </bookmark> } -Return bookmark info for $bookmark -map{index:..,title:..,hasChildren:..}pdfbox:bookmarkfunction pdfbox:bookmark ( $bookmark as item(),$pdf as item() ) as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} (:=>translate("�",""), :), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }bookmarkitem()pdfitem()map(*)java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemfindDestinationPagejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemgetTitlejava:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItemhasChildrenfunction pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -} -pageIndex of $page in $pdfpdfbox:find-pagefunction pdfbox:find-page ( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $pdf as item() ) as item()? 
{ if(exists($page)) then PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPages() =>PDPageTree:indexOf($page) }pageitem()pdfitem()item()http://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:find-page( +pageIndex of $page in $pdfpdfbox:find-pagefunction pdfbox:find-page ( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $pdf as item() ) as item()?pageitem()pdfitem()item()http://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:find-page( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $pdf as item()) as item()? @@ -793,14 +780,14 @@ as item()? =>PDPageTree:indexOf($page) } Return new PDF doc with pages from $start to $end as xs:base64Binary, (1 based) -$start first page to include$end last page to includepdfbox:extract-rangefunction pdfbox:extract-range ( $pdf as item(), $start as xs:integer,$end as xs:integer ) as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) }pdfitem()startxs:integerendxs:integerxs:base64Binaryjava:org.apache.pdfbox.multipdf.PageExtractorneworg.expkg_zone58.Pdfbox3binaryorg.expkg_zone58.Pdfbox3closefunction pdfbox:extract-range($pdf as item(), +$start first page to include$end last page to includepdfbox:extract-rangefunction pdfbox:extract-range ( $pdf as item(), $start as xs:integer,$end as xs:integer ) as xs:base64Binarypdfitem()startxs:integerendxs:integerxs:base64Binaryjava:org.apache.pdfbox.multipdf.PageExtractorneworg.expkg_zone58.Pdfbox3binaryorg.expkg_zone58.Pdfbox3closefunction pdfbox:extract-range($pdf as item(), $start as xs:integer,$end as xs:integer) as xs:base64Binary { let $a:=PageExtractor:new($pdf, $start, $end) =>PageExtractor:extract() return (pdfbox:binary($a),pdfbox:close($a)) } -The number of labels defined in PDFpdfbox:number-of-labelsfunction pdfbox:number-of-labels ( 
$pdf as item() ) as xs:integer { let $labels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return if(exists($labels)) then PDPageLabels:getPageRangeCount($labels) else 0 }pdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageRangeCountfunction pdfbox:number-of-labels($pdf as item()) +The number of labels defined in PDFpdfbox:number-of-labelsfunction pdfbox:number-of-labels ( $pdf as item() ) as xs:integerpdfitem()xs:integerjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCataloghttp://www.w3.org/2005/xpath-functionsexistsjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageRangeCountfunction pdfbox:number-of-labels($pdf as item()) as xs:integer { let $labels:=PDDocument:getDocumentCatalog($pdf) @@ -811,7 +798,7 @@ as xs:integer } pageLabel for every page from derived from page-ranges The returned sequence will contain at MOST as much entries as the document has pages. 
-https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labels-by-pagefunction pdfbox:labels-by-page ( $pdf as item() ) as xs:string* { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() =>PDPageLabels:getLabelsByPageIndices() }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:labels-by-page($pdf as item()) +https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-exampleshttps://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-filespdfbox:labels-by-pagefunction pdfbox:labels-by-page ( $pdf as item() ) as xs:string*pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:labels-by-page($pdf as item()) as xs:string* { PDDocument:getDocumentCatalog($pdf) @@ -819,7 +806,7 @@ as xs:string* =>PDPageLabels:getLabelsByPageIndices() } sequence of label ranges defined in PDF as formatted strings -a custom representation of the labels e.g "0-*Cover,1r,11D"pdfbox:labels-as-stringfunction pdfbox:labels-as-string ( $pdf as item() ) as xs:string { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return $pagelabels !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-string($pagelabels,.)=>string-join("&#10;") }pdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-stringfunction pdfbox:labels-as-string($pdf as item()) +a custom representation of the labels e.g "0-*Cover,1r,11D"pdfbox:labels-as-stringfunction pdfbox:labels-as-string ( $pdf as item() ) as xs:stringpdfitem()xs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-stringfunction pdfbox:labels-as-string($pdf as item()) as xs:string{ let 
$pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() @@ -828,12 +815,12 @@ as xs:string{ !pdfbox:label-as-string($pagelabels,.)=>string-join("&#10;") } -get pagelabels existpdfbox:page-labelsfunction pdfbox:page-labels ( $pdf ) { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() }pdfjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:page-labels($pdf) +get pagelabels existpdfbox:page-labelsfunction pdfbox:page-labels ( $pdf )pdfjava:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogfunction pdfbox:page-labels($pdf) { PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() } -label for $page formated as string, empty if nonepdfbox:label-as-stringfunction pdfbox:label-as-string ( $pagelabels,$page as xs:integer ) as xs:string? { let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) return if(empty($label)) then () else let $start:= PDPageLabelRange:getStart($label) let $style := PDPageLabelRange:getStyle($label) let $prefix:= PDPageLabelRange:getPrefix($label) return string-join(($page, if(empty($style)) then "-" else $style, if(($start eq 1)) then "" else $start, if(exists($prefix)) then '*' || $prefix (:TODO double " :) )) }pagelabelspagexs:integerxs:stringjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylejava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixhttp://www.w3.org/2005/xpath-functionsstring-joinhttp://www.w3.org/2005/xpath-functionsemptyhttp://www.w3.org/2005/xpath-functionsexistsfunction pdfbox:label-as-string($pagelabels,$page as xs:integer) +label for $page formated as string, empty if nonepdfbox:label-as-stringfunction pdfbox:label-as-string ( $pagelabels,$page as xs:integer ) as 
xs:string?pagelabelspagexs:integerxs:stringjava:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylejava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixhttp://www.w3.org/2005/xpath-functionsstring-joinhttp://www.w3.org/2005/xpath-functionsemptyhttp://www.w3.org/2005/xpath-functionsexistsfunction pdfbox:label-as-string($pagelabels,$page as xs:integer) as xs:string?{ let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) return if(empty($label)) @@ -848,7 +835,7 @@ as xs:string?{ if(exists($prefix)) then '*' || $prefix (:TODO double " :) )) } -sequence of maps for each label/page range defined in $pdfpdfbox:labels-as-mapfunction pdfbox:labels-as-map ( $pdf as item() ) as map(*)* { let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() return $pagelabels !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-map($pagelabels,.) }pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-mapfunction pdfbox:labels-as-map($pdf as item()) +sequence of maps for each label/page range defined in $pdfpdfbox:labels-as-mapfunction pdfbox:labels-as-map ( $pdf as item() ) as map(*)*pdfitem()map(*)java:org.apache.pdfbox.pdmodel.PDDocumentgetDocumentCatalogorg.expkg_zone58.Pdfbox3number-of-pagesorg.expkg_zone58.Pdfbox3label-as-mapfunction pdfbox:labels-as-map($pdf as item()) as map(*)*{ let $pagelabels:=PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPageLabels() @@ -856,7 +843,7 @@ as map(*)*{ !(0 to pdfbox:number-of-pages($pdf)-1) !pdfbox:label-as-map($pagelabels,.) 
} -label/page-range for $page as mappdfbox:label-as-mapfunction pdfbox:label-as-map ( $pagelabels,$page as xs:integer ) as map(*) { let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) return if(empty($label)) then () else map{ "index": $page, "prefix": PDPageLabelRange:getPrefix($label), "start": PDPageLabelRange:getStart($label), "style": PDPageLabelRange:getStyle($label) } }pagelabelspagexs:integermap(*)java:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylefunction pdfbox:label-as-map($pagelabels,$page as xs:integer) +label/page-range for $page as mappdfbox:label-as-mapfunction pdfbox:label-as-map ( $pagelabels,$page as xs:integer ) as map(*)pagelabelspagexs:integermap(*)java:org.apache.pdfbox.pdmodel.common.PDPageLabelsgetPageLabelRangehttp://www.w3.org/2005/xpath-functionsemptyjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetPrefixjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStartjava:org.apache.pdfbox.pdmodel.common.PDPageLabelRangegetStylefunction pdfbox:label-as-map($pagelabels,$page as xs:integer) as map(*) { let $label:=PDPageLabels:getPageLabelRange($pagelabels,$page) @@ -869,7 +856,7 @@ as map(*) "style": PDPageLabelRange:getStyle($label) } } -return text on $pageNopdfbox:page-textfunction pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:string { let $tStripper := (# db:wrapjava instance #) { PDFTextStripper:new() => PDFTextStripper:setStartPage($pageNo) => PDFTextStripper:setEndPage($pageNo) } return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.text.PDFTextStrippernewjava:org.apache.pdfbox.text.PDFTextStrippergetTextfunction pdfbox:page-text($pdf as item(), $pageNo as xs:integer) +return text 
on $pageNopdfbox:page-textfunction pdfbox:page-text ( $pdf as item(), $pageNo as xs:integer ) as xs:stringpdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.text.PDFTextStrippernewjava:org.apache.pdfbox.text.PDFTextStrippergetTextfunction pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{ let $tStripper := (# db:wrapjava instance #) { PDFTextStripper:new() @@ -879,17 +866,17 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} } Return size of $pageNo (zero based) -e.g. [0.0,0.0,168.0,239.52]pdfbox:page-media-boxfunction pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:string { PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() }pdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetPagefunction pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) +e.g. [0.0,0.0,168.0,239.52]pdfbox:page-media-boxfunction pdfbox:page-media-box ( $pdf as item(), $pageNo as xs:integer ) as xs:stringpdfitem()pageNoxs:integerxs:stringjava:org.apache.pdfbox.pdmodel.PDDocumentgetPagefunction pdfbox:page-media-box($pdf as item(), $pageNo as xs:integer) as xs:string{ PDDocument:getPage($pdf, $pageNo) =>PDPage:getMediaBox() =>PDRectangle:toString() } -Version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:string { Q{java:org.apache.pdfbox.util.Version}getVersion() }xs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() +Version of Apache Pdfbox in use e.g. "3.0.4"pdfbox:versionfunction pdfbox:version ( ) as xs:stringxs:stringjava:org.apache.pdfbox.util.VersiongetVersionfunction pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() } -Convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string? 
{ if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() else () }itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) +Convert datepdfbox:gregToISOfunction pdfbox:gregToISO ( $item as item()? ) as xs:string?itemitem()xs:stringhttp://www.w3.org/2005/xpath-functionsexistsjava:java.util.GregorianCalendartoZonedDateTimefunction pdfbox:gregToISO($item as item()?) as xs:string?{ if(exists($item)) then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() @@ -897,7 +884,7 @@ as xs:string?{ } fn:do-until shim for BaseX 9+10 if fn:do-until not found use hof:until, note: $pos always zero -pdfbox:do-untilfunction pdfbox:do-until ( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? ) as item()* { let $fn:=function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3) return if(exists($fn)) then $fn($input,$action,$predicate) else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) return if(exists($hof)) then $hof($predicate(?,0),$action(?,0),$input) else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") }inputitem()actionfunction(item()*, xs:integer) as item()*predicatefunction(item()*, xs:integer) as xs:boolean?item()http://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamefunction pdfbox:do-until( +pdfbox:do-untilfunction pdfbox:do-until ( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? 
) as item()*inputitem()actionfunction(item()*, xs:integer) as item()*predicatefunction(item()*, xs:integer) as xs:boolean?item()http://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionsfunction-lookuphttp://www.w3.org/2005/xpath-functionsQNamehttp://www.w3.org/2005/xpath-functionsexistshttp://www.w3.org/2005/xpath-functionserrorhttp://www.w3.org/2001/XMLSchemaQNamefunction pdfbox:do-until( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? diff --git a/docs/xqdoc/modules/F000001/xqparse.xml b/docs/xqdoc/modules/F000001/xqparse.xml index c1564e9..48057f6 100644 --- a/docs/xqdoc/modules/F000001/xqparse.xml +++ b/docs/xqdoc/modules/F000001/xqparse.xml @@ -35,7 +35,7 @@ refer to the same concept. Also label and (page)range are used interchangably&#x declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; +declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle"; declare namespace File ="java:java.io.File"; @@ -58,11 +58,6 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) }; -(:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -}; (:~ open pdf from file/url/binary, opts may have password , returns pdf object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item @@ -87,6 +82,13 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) } }; +(:~ open pdf from a location, returns pdf object :) +declare function pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +}; + + (:~ The version of the PDF 
specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) @@ -95,13 +97,13 @@ returned as string to avoid float rounding issues PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ Save pdf <code>$pdf</code> to filesystem at <code>$savepath</code> , returns $savepath :) declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ Create binary representation of $pdf object as xs:base64Binary :) +(:~ Create binary representation (xs:base64Binary) of <code>$pdf</code> object :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -198,12 +200,6 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; -(:~ summary CSV style info for all properties for $pdfpaths -:) -declare function pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,pdfbox:property-names()) -}; (:~ summary CSV style info for named $properties for PDFs in $pdfpaths @see https://docs.basex.org/main/CSV_Functions#xquery @@ -233,6 +229,13 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) } }; +(:~ summary CSV style info for all properties for $pdfpaths +:) +declare function pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,pdfbox:property-names()) +}; + (:~ Convenience function to save report() data to file :) declare function pdfbox:report-save($data as map(*),$dest as xs:string) as empty-sequence(){ @@ -306,9 +309,14 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? 
:) map{"list":(),"this":$outlineItem}, function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + let $bookmark:=$input?this + let $bk:=map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + } + + let $bk:= if(PDOutlineItem:hasChildren($bookmark)) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ @@ -339,21 +347,6 @@ options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) </bookmark> }; -(:~ Return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} -:) -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -}; - - (:~ pageIndex of $page in $pdf :) declare function pdfbox:find-page( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), diff --git a/docs/xqdoc/restxq.html b/docs/xqdoc/restxq.html index 0584079..00b1d5f 100644 --- a/docs/xqdoc/restxq.html +++ b/docs/xqdoc/restxq.html @@ -7,4 +7,4 @@ Contents
                              1. 1 Summary
                              2. 2 Rest Paths

                              Summary

                              No RESTXQ usage

                              Related documents
                              ViewDescriptionFormat
                              reportIndex of sourcesxhtml
                              importsSummary of import usagexhtml
                              imports-diagProject wide module imports as html mermaid class diagramhtml5
                              imports-diag.mmdProject wide module imports as a mermaid class diagramtext
                              annotationsSummary of XQuery annotation usexhtml
                              xqdoca.xmlxqDocA run configuration report (XML)xml
                              xqdoc-validatevalidate generated xqdoc filesxml

                              Rest interface paths

                              \ No newline at end of file +   on Monday, 9th June 2025

                              \ No newline at end of file diff --git a/docs/xqdoc/validation-report.xml b/docs/xqdoc/validation-report.xml index 4ee2c65..af1570e 100644 --- a/docs/xqdoc/validation-report.xml +++ b/docs/xqdoc/validation-report.xml @@ -1 +1 @@ -valid \ No newline at end of file +valid \ No newline at end of file diff --git a/docs/xqdoc/xqdoca.xml b/docs/xqdoc/xqdoca.xml index 1bacc45..736c3e1 100644 --- a/docs/xqdoc/xqdoca.xml +++ b/docs/xqdoc/xqdoca.xml @@ -1,4 +1,4 @@ -0.9.1docs/xqdoc/ +0.9.1docs/xqdoc/ report restxq imports @@ -10,4 +10,4 @@ module xqdoc xqparse - basex*.xqm,*.xq,*.xquerysrcsrc/truetrue1.1true \ No newline at end of file + basextrue*.xqm,*.xq,*.xquerysrcsrc/truetrue1.1true \ No newline at end of file diff --git a/package.json b/package.json index ccaffd3..a8c2def 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.4.0", + "version": "0.5.0", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", @@ -8,9 +8,9 @@ "doc": "docs" }, "scripts": { + "xar build": "%BASEX10%/bin/basex scripts/make-xar.xq", "test": "%BASEX10%/bin/basex -Wt tests", - "docs": "xqdoca", - "build": "%BASEX10%/bin/basex scripts/make-xar.xq" + "docs build": "xqdoca" }, "keywords": [ "pdf", diff --git a/readme.md b/readme.md index 17d1a35..d3d53ac 100644 --- a/readme.md +++ b/readme.md @@ -29,7 +29,7 @@ The features focus on extracting information from PDFs rather than creation or e * Form processing ## Documentation -* Function [documentation](doc.md) +* Function [documentation](docs/guide.md) * The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful. 
# Install diff --git a/samples.pdf/readme.md b/samples.pdf/readme.md index 1cc64e0..6d57d71 100644 --- a/samples.pdf/readme.md +++ b/samples.pdf/readme.md @@ -5,8 +5,8 @@ |------|-----------|--------|----------|---| |[BaseX100.pdf](BaseX100.pdf)||✅||https://files.basex.org/releases/10.0/BaseX100.pdf| |[icelandic-dictionary.pdf](icelandic-dictionary.pdf)|✅|| |http://css4.pub/2015/icelandic/dictionary.pdf| -|[page-numbers.pdf](page-numbers.pdf)||✅||https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers| -|[page-numbers-password.pdf](page-numbers-password.pdf)||✅|✅(password)|https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers| +|[page-numbers.pdf](page-numbers.pdf)||✅||https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers.pdf| +|[page-numbers-password.pdf](page-numbers-password.pdf)||✅|✅(password)|https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers.pdf| |[Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf)|✅|||https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf| |[Legal RAG Hallucinations](Legal_RAG_Hallucinations.pdf)|✅|||https://law.stanford.edu/wp-content/uploads/2024/05/Legal_RAG_Hallucinations.pdf| diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 572d067..25f5b56 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -35,7 +35,7 @@ declare namespace rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; -declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle"; +declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle"; declare namespace File 
="java:java.io.File"; @@ -58,11 +58,6 @@ as item()*{ }; -(:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfsrc as item()) -as item(){ -pdfbox:open($pdfsrc, map{}) -}; (:~ open pdf from file/url/binary, opts may have password , returns pdf object @param $pdfsrc a fetchable url or filepath, or xs:base64Binary item @@ -87,6 +82,13 @@ as item(){ } }; +(:~ open pdf from a location, returns pdf object :) +declare function pdfbox:open($pdfsrc as item()) +as item(){ +pdfbox:open($pdfsrc, map{}) +}; + + (:~ The version of the PDF specification used by $pdf e.g "1.4" returned as string to avoid float rounding issues :) @@ -95,13 +97,13 @@ as xs:string{ PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; -(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) +(:~ Save pdf $pdf to filesystem at $savepath , returns $savepath :) declare function pdfbox:pdf-save($pdf as item(),$savepath as xs:string) as xs:string{ PDDocument:save($pdf, File:new($savepath)),$savepath }; -(:~ Create binary representation of $pdf object as xs:base64Binary :) +(:~ Create binary representation (xs:base64Binary) of $pdf object :) declare function pdfbox:binary($pdf as item()) as xs:base64Binary{ let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() @@ -198,12 +200,6 @@ as item()*{ else error(xs:QName('pdfbox:property'),concat("Property '",$property,"' not defined.")) }; -(:~ summary CSV style info for all properties for $pdfpaths -:) -declare function pdfbox:report($pdfpaths as xs:string*) -as map(*){ - pdfbox:report($pdfpaths,pdfbox:property-names()) -}; (:~ summary CSV style info for named $properties for PDFs in $pdfpaths @see https://docs.basex.org/main/CSV_Functions#xquery @@ -233,6 +229,13 @@ as map(*){ } }; +(:~ summary CSV style info for all properties for $pdfpaths +:) +declare function pdfbox:report($pdfpaths as xs:string*) +as map(*){ + pdfbox:report($pdfpaths,pdfbox:property-names()) +}; + (:~ Convenience function to save 
report() data to file :) declare function pdfbox:report-save($data as map(*),$dest as xs:string) as empty-sequence(){ @@ -306,9 +309,14 @@ as map(*){ map{"list":(),"this":$outlineItem}, function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + let $bookmark:=$input?this + let $bk:=map{ + "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + } + + let $bk:= if(PDOutlineItem:hasChildren($bookmark)) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($bookmark)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ @@ -339,21 +347,6 @@ as element(bookmark)* }; -(:~ Return bookmark info for $bookmark -@return map{index:..,title:..,hasChildren:..} -:) -declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item()) -as map(*) -{ - map{ - "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} - (:=>translate("�",""), :), - "hasChildren": PDOutlineItem:hasChildren($bookmark) - } -}; - - (:~ pageIndex of $page in $pdf :) declare function pdfbox:find-page( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),