This commit is contained in:
parent
5c9e32d119
commit
9f0bed7cd8
8 changed files with 64 additions and 40 deletions
|
@ -38,7 +38,7 @@ jobs:
|
||||||
|
|
||||||
- name: Verify BaseX installation
|
- name: Verify BaseX installation
|
||||||
run: |
|
run: |
|
||||||
basex -v
|
basex -c "SHOW OPTIONS"
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
@ -47,7 +47,8 @@ jobs:
|
||||||
|
|
||||||
- name: Build package
|
- name: Build package
|
||||||
run: |
|
run: |
|
||||||
basex scripts/install.bxs
|
basex scripts/make-xar.xq
|
||||||
|
basex scripts/repo-install.xq
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
|
|
|
@ -3,4 +3,4 @@
|
||||||
c
|
c
|
||||||
Xyxh
|
Xyxh
|
||||||
4456
|
4456
|
||||||
9.7.4 xyxz
|
9.7.4 xyxz0
|
3
.github/workflows/ci-basex.yaml
vendored
3
.github/workflows/ci-basex.yaml
vendored
|
@ -35,7 +35,8 @@ jobs:
|
||||||
|
|
||||||
- name: Build package
|
- name: Build package
|
||||||
run: |
|
run: |
|
||||||
basex scripts/install.bxs
|
basex scripts/make-xar.xq
|
||||||
|
basex scripts/repo-install.xq
|
||||||
|
|
||||||
- name: run tests
|
- name: run tests
|
||||||
run: |
|
run: |
|
||||||
|
|
3
changelog.md
Normal file
3
changelog.md
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
## 0.1.5 2025-02-10
|
||||||
|
* Add `isEncrypted`
|
||||||
|
* Rename `open` to `open-file`
|
17
package.xml
17
package.xml
|
@ -1,17 +0,0 @@
|
||||||
<package name="org.expkg_zone58.Pdfbox3"
|
|
||||||
abbrev="pdfbox"
|
|
||||||
version="0.1.1"
|
|
||||||
spec="1.0">
|
|
||||||
<component name="pdfbox-3.0.4.jar">
|
|
||||||
<source type="maven">org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar</source>
|
|
||||||
</component>
|
|
||||||
<component name="pdfbox-io-3.0.4.jar">
|
|
||||||
<source type="maven">org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar</source>
|
|
||||||
</component>
|
|
||||||
<component name="fontbox-3.0.4.jar">
|
|
||||||
<source type="maven">org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar</source>
|
|
||||||
</component>
|
|
||||||
<component name="commons-logging-1.3.4.jar">
|
|
||||||
<source type="maven">commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar</source>
|
|
||||||
</component>
|
|
||||||
</package>
|
|
39
readme.md
Normal file
39
readme.md
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
# Pdfbox
|
||||||
|
A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3.
|
||||||
|
It is packaged using the [EXPath](https://docs.basex.org/main/Repository#expath_packaging) format and is tested against BaseX 10.7 and 11.7. Note: the current release (v0.1.5) also works on BaseX 9.7.
|
||||||
|
|
||||||
|
* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
|
||||||
|
## Features
|
||||||
|
|
||||||
|
The features focus on extracting information from PDFs rather than creation or editing.
|
||||||
|
|
||||||
|
* read PDF page count.
|
||||||
|
* read any PDF outline and return as map(s) or XML.
|
||||||
|
* read pagelabels.
|
||||||
|
* read page text.
|
||||||
|
* save pdf page range to a new pdf.
|
||||||
|
* save image of rendered pdf page.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Install
|
||||||
|
Pre-built `pdfbox-x.y.z.xar` files are available on the releases page. They can be installed using the standard repository functions or using the GUI.
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
```xquery
|
||||||
|
import module namespace pdfbox="org.expkg_zone58.Pdfbox3";
|
||||||
|
|
||||||
|
pdfbox:with-pdf("...path/to/pdf.pdf",
|
||||||
|
function($pdf){
|
||||||
|
(1 to pdfbox:page-count($pdf))!pdfbox:page-text($pdf,.)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Build
|
||||||
|
|
||||||
|
* `scripts/make-xar.xq` packages the required `jar`s and `xqm` files to a `xar` file in the `dist` folder.
|
||||||
|
|
||||||
|
### Action support
|
||||||
|
|
||||||
|
The workflow `ci-basex.yaml` builds and tests the package. This can be used as an action on [GitHub](https://github.com/features/actions), or on a local [Gitea](https://docs.gitea.com/usage/actions/overview) installation.
|
|
@ -1,8 +0,0 @@
|
||||||
# run query
|
|
||||||
XQUERY "make xar.."
|
|
||||||
RUN make-xar.xq
|
|
||||||
XQUERY "Repo install.."
|
|
||||||
RUN repo-install.xq
|
|
||||||
REPO LIST
|
|
||||||
|
|
||||||
|
|
|
@ -28,14 +28,7 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
||||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
||||||
declare namespace File ="java:java.io.File";
|
declare namespace File ="java:java.io.File";
|
||||||
|
|
||||||
declare variable $pdfbox:package-version:="0.1.2";
|
|
||||||
|
|
||||||
(:~ version of Apache Pdfbox in use e.g. "3.0.4"
|
|
||||||
:)
|
|
||||||
declare function pdfbox:version()
|
|
||||||
as xs:string{
|
|
||||||
Q{java:org.apache.pdfbox.util.Version}getVersion()
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ with-document pattern: open pdf,apply function, close pdf
|
(:~ with-document pattern: open pdf,apply function, close pdf
|
||||||
creates a local pdfobject and ensures it is closed after use
|
creates a local pdfobject and ensures it is closed after use
|
||||||
|
@ -44,7 +37,7 @@ e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
|
||||||
declare function pdfbox:with-pdf($src as xs:string,
|
declare function pdfbox:with-pdf($src as xs:string,
|
||||||
$fn as function(item())as item()*)
|
$fn as function(item())as item()*)
|
||||||
as item()*{
|
as item()*{
|
||||||
let $pdf:=pdfbox:open($src)
|
let $pdf:=pdfbox:open-file($src)
|
||||||
return try{
|
return try{
|
||||||
$fn($pdf),pdfbox:close($pdf)
|
$fn($pdf),pdfbox:close($pdf)
|
||||||
} catch *{
|
} catch *{
|
||||||
|
@ -54,12 +47,12 @@ as item()*{
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ open pdf, returns pdf object :)
|
(:~ open pdf, returns pdf object :)
|
||||||
declare function pdfbox:open($pdfpath as xs:string)
|
declare function pdfbox:open-file($pdfpath as xs:string)
|
||||||
as item(){
|
as item(){
|
||||||
try{
|
try{
|
||||||
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
||||||
} catch *{
|
} catch *{
|
||||||
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath)
|
error(xs:QName("pdfbox:open-file"),"Failed to open: " || $pdfpath)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -122,7 +115,7 @@ as map(*){
|
||||||
(:~ summary info as map for $pdfpath :)
|
(:~ summary info as map for $pdfpath :)
|
||||||
declare function pdfbox:report($pdfpath as xs:string)
|
declare function pdfbox:report($pdfpath as xs:string)
|
||||||
as map(*){
|
as map(*){
|
||||||
let $pdf:=pdfbox:open($pdfpath)
|
let $pdf:=pdfbox:open-file($pdfpath)
|
||||||
return (map{
|
return (map{
|
||||||
"file": $pdfpath,
|
"file": $pdfpath,
|
||||||
"pages": pdfbox:page-count($pdf),
|
"pages": pdfbox:page-count($pdf),
|
||||||
|
@ -144,6 +137,12 @@ as xs:boolean{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
(:~ true if $pdf is encrypted :)
|
||||||
|
declare function pdfbox:isEncrypted($pdf as item())
|
||||||
|
as xs:boolean{
|
||||||
|
PDDocument:isEncrypted($pdf)
|
||||||
|
};
|
||||||
|
|
||||||
(:~ outline for $pdf as map()* :)
|
(:~ outline for $pdf as map()* :)
|
||||||
declare function pdfbox:outline($pdf as item())
|
declare function pdfbox:outline($pdf as item())
|
||||||
as map(*)*{
|
as map(*)*{
|
||||||
|
@ -268,6 +267,12 @@ as xs:string{
|
||||||
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
|
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
(:~ version of Apache Pdfbox in use e.g. "3.0.4" :)
|
||||||
|
declare function pdfbox:version()
|
||||||
|
as xs:string{
|
||||||
|
Q{java:org.apache.pdfbox.util.Version}getVersion()
|
||||||
|
};
|
||||||
|
|
||||||
(:~ convert date :)
|
(:~ convert date :)
|
||||||
declare %private
|
declare %private
|
||||||
function pdfbox:gregToISO($item as item())
|
function pdfbox:gregToISO($item as item())
|
||||||
|
|
Loading…
Add table
Reference in a new issue