diff --git a/.gitea/workflows/ci-basex.yaml b/.gitea/workflows/ci-basex.yaml index 6d1b1f7..870db26 100644 --- a/.gitea/workflows/ci-basex.yaml +++ b/.gitea/workflows/ci-basex.yaml @@ -38,7 +38,7 @@ jobs: - name: Verify BaseX installation run: | - basex -v + basex -c "SHOW OPTIONS" - name: Checkout repository uses: actions/checkout@v4 @@ -47,7 +47,8 @@ jobs: - name: Build package run: | - basex scripts/install.bxs + basex scripts/make-xar.xq + basex scripts/repo-install.xq - name: Run tests run: | diff --git a/.gitea/workflows/trigger/release b/.gitea/workflows/trigger/release index e29f10b..676895e 100644 --- a/.gitea/workflows/trigger/release +++ b/.gitea/workflows/trigger/release @@ -3,4 +3,4 @@ c Xyxh 4456 -9.7.4 xyxz \ No newline at end of file +9.7.4 xyxz0 \ No newline at end of file diff --git a/.github/workflows/ci-basex.yaml b/.github/workflows/ci-basex.yaml index bcc68ab..37c5eca 100644 --- a/.github/workflows/ci-basex.yaml +++ b/.github/workflows/ci-basex.yaml @@ -35,7 +35,8 @@ jobs: - name: Build package run: | - basex scripts/install.bxs + basex scripts/make-xar.xq + basex scripts/repo-install.xq - name: run tests run: | diff --git a/changelog.md b/changelog.md new file mode 100644 index 0000000..9c59f4a --- /dev/null +++ b/changelog.md @@ -0,0 +1,3 @@ +## 0.1.5 2025-02-10 +* Add `isEncrypted` +* Rename `open` to `open-file` \ No newline at end of file diff --git a/package.xml b/package.xml deleted file mode 100644 index d18c01f..0000000 --- a/package.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar - - - org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar - - - org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar - - - commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar - - \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..ad2c571 --- /dev/null +++ b/readme.md @@ -0,0 +1,39 @@ +# Pdfbox +A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 
3. +It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7. Note: the current release (v0.1.5) also works on BaseX 9.7. + +* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful. +## Features + +The features focus on extracting information from PDFs rather than creation or editing. + +* read PDF page count. +* read any PDF outline and return as map(s) or XML. +* read page labels. +* read page text. +* save pdf page range to a new pdf. +* save image of rendered pdf page. + + + +# Install +Pre-built `pdfbox-x.y.z.xar` files are available on the releases page. They can be installed using the standard repository functions or using the GUI. + +# Usage +```xquery +import module namespace pdfbox="org.expkg_zone58.Pdfbox3"; + +pdfbox:with-pdf("...path/to/pdf.pdf", + function($pdf){ + (1 to pdfbox:page-count($pdf))!pdfbox:page-text($pdf,.) + } +) +``` + +## Build + +* `scripts/make-xar.xq` packages the required `jar`s and `xqm` files to a `xar` file in the `dist` folder. + +### Action support + +The workflow `ci-basex.yaml` builds and tests the package. This can be used as an action on [github](https://github.com/features/actions), or on a local [gitea](https://docs.gitea.com/usage/actions/overview) installation. diff --git a/scripts/install.bxs b/scripts/install.bxs deleted file mode 100644 index 59ea7eb..0000000 --- a/scripts/install.bxs +++ /dev/null @@ -1,8 +0,0 @@ -# run query -XQUERY "make xar.." -RUN make-xar.xq -XQUERY "Repo install.." 
-RUN repo-install.xq -REPO LIST - - diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 22e159a..84ab1c2 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -28,14 +28,7 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace File ="java:java.io.File"; -declare variable $pdfbox:package-version:="0.1.2"; -(:~ version of Apache Pdfbox in use e.g. "3.0.4" -:) -declare function pdfbox:version() -as xs:string{ - Q{java:org.apache.pdfbox.util.Version}getVersion() -}; (:~ with-document pattern: open pdf,apply function, close pdf creates a local pdfobject and ensures it is closed after use @@ -44,7 +37,7 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) declare function pdfbox:with-pdf($src as xs:string, $fn as function(item())as item()*) as item()*{ - let $pdf:=pdfbox:open($src) + let $pdf:=pdfbox:open-file($src) return try{ $fn($pdf),pdfbox:close($pdf) } catch *{ @@ -54,12 +47,12 @@ as item()*{ }; (:~ open pdf, returns pdf object :) -declare function pdfbox:open($pdfpath as xs:string) +declare function pdfbox:open-file($pdfpath as xs:string) as item(){ try{ Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) } catch *{ - error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath) + error(xs:QName("pdfbox:open-file"),"Failed to open: " || $pdfpath) } }; @@ -122,7 +115,7 @@ as map(*){ (:~ summary info as map for $pdfpath :) declare function pdfbox:report($pdfpath as xs:string) as map(*){ - let $pdf:=pdfbox:open($pdfpath) + let $pdf:=pdfbox:open-file($pdfpath) return (map{ "file": $pdfpath, "pages": pdfbox:page-count($pdf), @@ -144,6 +137,12 @@ as xs:boolean{ } }; +(:~ true if $pdf is encrypted* :) +declare function pdfbox:isEncrypted($pdf as item()) +as xs:boolean{ + PDDocument:isEncrypted($pdf) +}; + (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) as map(*)*{ @@ -268,6 
+267,12 @@ as xs:string{ return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)} }; +(:~ version of Apache Pdfbox in use e.g. "3.0.4" :) +declare function pdfbox:version() +as xs:string{ + Q{java:org.apache.pdfbox.util.Version}getVersion() +}; + (:~ convert date :) declare %private function pdfbox:gregToISO($item as item())