From 4ea01764f94ad6b7cea25ef3e1182988e988406b Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Fri, 14 Feb 2025 11:01:20 +0000 Subject: [PATCH] [fix] #1 --- .gitea/workflows/trigger/base-act | 1 - .gitea/workflows/trigger/release | 3 +- changelog.md | 3 ++ doc.md | 5 -- package.json | 2 +- readme.md | 3 +- src/Pdfbox3.xqm | 87 +++++++++++++++---------------- 7 files changed, 49 insertions(+), 55 deletions(-) delete mode 100644 .gitea/workflows/trigger/base-act diff --git a/.gitea/workflows/trigger/base-act b/.gitea/workflows/trigger/base-act deleted file mode 100644 index f9c9657..0000000 --- a/.gitea/workflows/trigger/base-act +++ /dev/null @@ -1 +0,0 @@ -aabcdeggxcd \ No newline at end of file diff --git a/.gitea/workflows/trigger/release b/.gitea/workflows/trigger/release index a1b2e53..d936185 100644 --- a/.gitea/workflows/trigger/release +++ b/.gitea/workflows/trigger/release @@ -6,4 +6,5 @@ Xyxh 9.7.4 xyxz01x ab iiiisAasxs -x \ No newline at end of file +x +A \ No newline at end of file diff --git a/changelog.md b/changelog.md index 9c59f4a..31cf537 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,6 @@ +## 0.1.6 2025-02-14 +* Add `hasLabels` +* FIX #1 error if no labels ## 0.1.5 2025-02-10 * Add `isEncrypted` * Rename `open` to `open-file` \ No newline at end of file diff --git a/doc.md b/doc.md index 0d68c92..58fb074 100644 --- a/doc.md +++ b/doc.md @@ -225,7 +225,6 @@ Returns the version of the Apache PDFBox library in use. ## Notes -- Ensure that the `pdfbox-app-3.0.4.jar` (or a compatible version) is on the classpath. - The library is designed to work with BaseX 10.7+. - Some functions may throw errors if the PDF is encrypted or if the file cannot be opened. @@ -258,7 +257,3 @@ return pdfbox:metadata($pdf) let $pdf := pdfbox:open-file("path/to/document.pdf") return pdfbox:extract($pdf, 1, 3, "path/to/new/document.pdf") ``` - -## Conclusion - -The `Pdfbox3.xqm` library provides a powerful interface for working with PDF documents in XQuery. It allows you to extract text, render pages, extract metadata, and more. \ No newline at end of file diff --git a/package.json b/package.json index 9f21c73..7cba85c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.1.5", + "version": "0.1.6", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/npm/example#readme", diff --git a/readme.md b/readme.md index 26e608a..5fde2fa 100644 --- a/readme.md +++ b/readme.md @@ -3,13 +3,12 @@ A `BaseX` interface for the `Apache Pdfbox library` version 3. The [Apache PDFBox® library](https://pdfbox.apache.org/) is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. -This interface is packaged in the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format. +This interface is packaged in the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format. The package includes the required Pdfbox jars. A test suite is available and workflow actions run this on BaseX 10.7 and 11.7. > [!NOTE] >Currently (v0.1.5) works with BaseX 9.7, but this may change with future versions. -* The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful. ## Features diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 89feba0..4ed9d84 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -92,28 +92,23 @@ as xs:base64Binary{ }; declare variable $pdfbox:doc-info:=map{ - "title": PDDocumentInformation:getTitle#1, - "creator": PDDocumentInformation:getCreator#1, - "producer": PDDocumentInformation:getProducer#1, - "subject": PDDocumentInformation:getSubject#1, - "keywords": PDDocumentInformation:getKeywords#1, - "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate#1), - "author": PDDocumentInformation:getAuthor#1 + "title": PDDocumentInformation:getTitle#1, + "author": PDDocumentInformation:getAuthor#1, + "creator": PDDocumentInformation:getCreator#1, + "producer": PDDocumentInformation:getProducer#1, + "subject": PDDocumentInformation:getSubject#1, + "keywords": PDDocumentInformation:getKeywords#1, + "creationdate": function($i){pdfbox:gregToISO(PDDocumentInformation:getCreationDate($i))}, + "modificationdate": function($i){pdfbox:gregToISO(PDDocumentInformation:getModificationDate($i))} }; (:~ map with document metadata :) declare function pdfbox:metadata($pdf as item()) as map(*){ let $info:=PDDocument:getDocumentInformation($pdf) - return map{ - "title": PDDocumentInformation:getTitle($info), - "creator": PDDocumentInformation:getCreator($info), - "producer": PDDocumentInformation:getProducer($info), - "subject": PDDocumentInformation:getSubject($info), - "keywords": PDDocumentInformation:getKeywords($info), - "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)), - "author": PDDocumentInformation:getAuthor($info) - } + return map:for-each($pdfbox:doc-info, + function($k,$v){map:entry($k,$pdfbox:doc-info($k)($info))}) + =>map:merge() }; (:~ summary info as map for $pdfpath :) @@ -124,27 +119,26 @@ as map(*){ "file": $pdfpath, "pages": pdfbox:page-count($pdf), "hasOutline": pdfbox:hasOutline($pdf), + "hasLabels": pdfbox:hasLabels($pdf), "specification":pdfbox:specification($pdf) },pdfbox:metadata($pdf) )=>map:merge() }; - (:~ true if $pdf has an outline for $pdf as map()* :) +(:~ true if $pdf has an outline :) declare function pdfbox:hasOutline($pdf as item()) as xs:boolean{ - (# db:wrapjava some #) { - let $outline:= - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getDocumentOutline() - - return exists($outline) - } + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getDocumentOutline() + =>exists() }; -(:~ true if $pdf is encrypted* :) -declare function pdfbox:isEncrypted($pdf as item()) +(:~ true if $pdf has Labels :) +declare function pdfbox:hasLabels($pdf as item()) as xs:boolean{ - PDDocument:isEncrypted($pdf) + PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + =>exists() }; (:~ outline for $pdf as map()* :) @@ -162,7 +156,6 @@ as map(*)*{ (:~ return bookmark info for children of $outlineItem as seq of maps :) declare function pdfbox:outline($pdf as item(),$outlineItem as item()?) - as map(*)*{ let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem) return map:get($find,"list") @@ -176,15 +169,15 @@ as map(*){ map{"list":(),"this":$outlineItem}, function($input,$pos ) { - let $bk:= pdfbox:bookmark($input?this,$pdf) - let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) - return map:merge(($bk,map:entry("children",$kids))) - else $bk - return map{ - "list": ($input?list, $bk), - "this": PDOutlineItem:getNextSibling($input?this)} - }, + let $bk:= pdfbox:bookmark($input?this,$pdf) + let $bk:= if($bk?hasChildren) + then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this)) + return map:merge(($bk,map:entry("children",$kids))) + else $bk + return map{ + "list": ($input?list, $bk), + "this": PDOutlineItem:getNextSibling($input?this)} + }, function($output,$pos) { empty($output?this) } ) @@ -248,16 +241,18 @@ as xs:string }; -(:~ pageLabel for every page +(:~ pageLabel for every page or empty if none @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files :) declare function pdfbox:labels($pdf as item()) as xs:string* { - PDDocument:getDocumentCatalog($pdf) - =>PDDocumentCatalog:getPageLabels() - =>PDPageLabels:getLabelsByPageIndices() + let $pagelabels:=PDDocument:getDocumentCatalog($pdf) + =>PDDocumentCatalog:getPageLabels() + return if(exists($pagelabels)) + then PDPageLabels:getLabelsByPageIndices($pagelabels) + else () }; (:~ return text on $pageNo :) @@ -279,9 +274,11 @@ as xs:string{ (:~ convert date :) declare %private -function pdfbox:gregToISO($item as item()) -as xs:string{ - Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() +function pdfbox:gregToISO($item as item()?) +as xs:string?{ + if(exists($item)) + then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() + else () }; (:~ fn:do-until shim for BaseX 9+10 @@ -299,6 +296,6 @@ declare %private function pdfbox:do-until( else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3) return if(exists($hof)) then $hof($predicate(?,0),$action(?,0),$input) - else error(xs:QName('pdfbox:do-until'),"No implementation found") + else error(xs:QName('pdfbox:do-until'),"No implementation do-until found") };