diff --git a/.gitea/workflows/ci-basex.yaml b/.gitea/workflows/ci-basex.yaml
index c67f4d3..fb0c099 100644
--- a/.gitea/workflows/ci-basex.yaml
+++ b/.gitea/workflows/ci-basex.yaml
@@ -11,7 +11,7 @@ on:
jobs:
test:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-latest
steps:
- name: Set up Java
uses: actions/setup-java@v4
diff --git a/.github/workflows/ci-basex.yaml b/.github/workflows/ci-basex.yaml
index 197988d..bcc68ab 100644
--- a/.github/workflows/ci-basex.yaml
+++ b/.github/workflows/ci-basex.yaml
@@ -8,7 +8,7 @@ on:
jobs:
test:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-latest
steps:
- name: Set up Java
uses: actions/setup-java@v4
diff --git a/README.md b/README.md
deleted file mode 100644
index a682f5c..0000000
--- a/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Pdfbox
-A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3.
-It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7.
-
-* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
-## Features
-* read PDF page count.
-* read any PDF outline and return as map(s) or XML.
-* read pagelabels.
-* read page text.
-* save pdf page range to a new pdf.
-* save pdf page as an image.
-
-
-## Build
-
-* `scripts/make-xar.xq` packages the required `jar`s and `xqm` files to a `xar` file in the `dist` folder.
-
-### Action support
-
-The workflow `ci-basex.yaml` builds and tests the package. This can be used as an action on [github](https://github.com/features/actions), or on a local [gitea](https://docs.gitea.com/usage/actions/overview) installation.
diff --git a/docs/pdfbox.xqbk b/docs/pdfbox.xqbk
deleted file mode 100644
index d19e4fd..0000000
--- a/docs/pdfbox.xqbk
+++ /dev/null
@@ -1 +0,0 @@
-{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape' at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\"; 
\r\n"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> 
pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]}
\ No newline at end of file
diff --git a/package.json b/package.json
index ab537f8..1d5a398 100644
--- a/package.json
+++ b/package.json
@@ -1,8 +1,8 @@
{
"name": "pdfbox",
- "version": "0.1.1",
+ "version": "0.1.3",
"description": "A BaseX interface to Apache Pdfbox version 3",
- "main": "Pdfbox.xqm",
+ "main": "src/Pdfbox3.xqm",
"homepage": "https://github.com/npm/example#readme",
"directories": {
"doc": "docs"
@@ -20,6 +20,16 @@
"author": "Andy Bunce",
"license": "Apache-2.0",
"quodatum": {
- "random": true
+ "random": true,
+ "namespace": "org.expkg_zone58.Pdfbox3",
+ "main-class": "org.apache.pdfbox.pdmodel.PDDocument",
+ "download": "jars/",
+ "maven": [
+ "org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
+ "org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
+ "org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
+ "commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
+ ]
+
}
}
\ No newline at end of file
diff --git a/scripts/build.xqm b/scripts/build.xqm
index d7cf5dd..a020fab 100644
--- a/scripts/build.xqm
+++ b/scripts/build.xqm
@@ -1,11 +1,19 @@
-(:~ build utils for REPO packaging :)
-module namespace build = 'urn:quodatum:build1';
+(:~ build utils for REPO packaging
-(:~ create a flat fat jar from jars in $input-dir
-keeping only META-INF from $manifest-jar
:)
+module namespace build = 'urn:quodatum:build1';
+declare namespace bxpkg='http://www.basex.org/modules/pkg';
+declare namespace pkg='http://expath.org/ns/pkg';
+
+(:~ jar compress options :)
declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" };
+declare variable $build:base:= file:resolve-path("../",static-base-uri())=>trace("base ");
+declare variable $build:PKG:=json:doc(file:resolve-path("package.json",$build:base),map{"format":"xquery"});
+
+(:~ return binary for fat jar from jars in $input-dir
+keeping only META-INF from $manifest-jar
+:)
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary {
let $fold :=
@@ -54,31 +62,77 @@ as xs:base64Binary{
archive:update($jar,$name,$file)
};
-declare function build:xar-create($base as xs:string)
+(:~ build basex.xml descriptor text from package.json: one jar element per maven jar name, plus the main class :)
+declare function build:basex.xml()
+as xs:string{
+``[<package xmlns="http://www.basex.org/modules/pkg">
+    `{ build:jars("name")!concat('<jar>',.,'</jar>') }`
+    <class>`{ $build:PKG?quodatum?main-class }`</class>
+</package>
+]``
+
+};
+
+(:~ expath-pkg.xml text from package.json (description, quodatum.namespace, file name of main). NOTE(review): no XML element markup is visible in this template - confirm the tags were not lost :)
+declare function build:expath-pkg.xml()
+as xs:string{
+ ``[
 `{$build:PKG?description}`


 `{$build:PKG?quodatum?namespace}`
 `{$build:PKG?main=>replace("^.*/","")}`


+ ]``
+
+};
+
+declare function build:xar-create()
as xs:base64Binary{
+(: xar entries: jars and the main module (path taken from package.json "main") under content/, then both generated descriptors :)
let $entries:=
- build:xar-add(map{},file:resolve-path("jars/",$base),"content/")
- =>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/")
- =>build:xar-add(file:resolve-path("src/metadata/",$base),"")
+ build:xar-add(map{},build:jars("content"),build:jars("download")!build:content(.))
+ =>build:xar-add("content/" || replace($build:PKG?main,"^.*/",""),build:content($build:PKG?main))
+ =>build:xar-add("expath-pkg.xml",convert:string-to-base64(build:expath-pkg.xml()))
+ =>build:xar-add("basex.xml",convert:string-to-base64(build:basex.xml()))
return archive:create($entries?name, $entries?content,$build:archive-opts)
};
-(:~ zip data for $dir
-:)
-declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string)
+(:~ binary content of the file at $path, with $path resolved against $build:base :)
+declare function build:content($path as xs:string) as xs:base64Binary{
+  let $file:=file:resolve-path($path,$build:base)
+  return file:read-binary($file)
+};
+
+(:~ return a map{"name":..,"content":..} with $xar-path appended to $map?name and $content appended to $map?content; xar-create passes both sequences to archive:create, so they are expected to stay index-aligned :)
+declare function build:xar-add($map as map(*),$xar-path as xs:string*,$content as item()*)
as map(*){
-let $names:=if(file:is-dir($src))
- then file:list($src)[not(starts-with(.,'.'))]!concat($src,.)
- else $src
-return map:merge((
- $map,
- map{"name":$names!concat($xar-dir,file:name(.)),
- "content":$names!file:read-binary( .)}
- ),
- map{"duplicates":"combine"}
- )
+ map{"name": ($map?name,$xar-path), "content": ($map?content,$content)}
};
+(:~ location of the versioned xar: dist/pdfbox-{package.json version}.xar resolved against $build:base :)
+declare function build:xar-path() as xs:string{
+  file:resolve-path(
+    "dist/pdfbox-" || $build:PKG?version || ".xar",
+    $build:base)
+};
+
+(:~ jars listed in package.json quodatum.maven; $style "name": file names, "download": download folder + name, "content": "content/" + name, otherwise the maven paths as listed :)
+declare function build:jars($style as xs:string)
+as xs:string*{
+let $paths:=array:flatten($build:PKG?quodatum?maven)
+let $files:=for $p in $paths return replace($p,"^.*/","")
+return if($style eq "name") then $files
+else if($style eq "download") then $files!concat($build:PKG?quodatum?download,.)
+else if($style eq "content") then $files!concat("content/",.)
+else $paths
+};
+
(:~ download $files from $urls to $destdir:)
declare variable $build:REPO as xs:string external :="https://repo1.maven.org/maven2/";
declare function build:maven-download($urls as xs:string*,$destdir as xs:string)
diff --git a/scripts/install.bxs b/scripts/install.bxs
index 1bd050e..59ea7eb 100644
--- a/scripts/install.bxs
+++ b/scripts/install.bxs
@@ -2,7 +2,7 @@
XQUERY "make xar.."
RUN make-xar.xq
XQUERY "Repo install.."
-REPO INSTALL dist/pdfbox.xar
+RUN repo-install.xq
REPO LIST
diff --git a/scripts/make-xar.xq b/scripts/make-xar.xq
index 8ce21fb..3911b54 100644
--- a/scripts/make-xar.xq
+++ b/scripts/make-xar.xq
@@ -1,16 +1,8 @@
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
-declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base ");
-
-declare variable $maven-urls := (
-"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
-"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
-"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
-"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
-);
-let $_:=build:maven-download($maven-urls,$base || "jars/")
-let $xar:=build:xar-create($base)
-let $output-file := file:resolve-path("dist/pdfbox.xar",$base)
+let $_:=build:maven-download($build:PKG?quodatum?maven=>array:flatten(),$build:base || $build:PKG?quodatum?download)
+let $xar:=build:xar-create() (: reads the jars back via build:jars("download"), i.e. from the same package.json folder :)
+let $output-file := build:xar-path()
return (build:write-binary($output-file, $xar),
trace($output-file,"xar: "))
diff --git a/scripts/repo-install.xq b/scripts/repo-install.xq
new file mode 100644
index 0000000..5e6b733
--- /dev/null
+++ b/scripts/repo-install.xq
@@ -0,0 +1,8 @@
+
+import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
+(: install the xar from the path make-xar.xq writes to; build:xar-path() is used by both scripts so they cannot drift :)
+let $output-file := build:xar-path()
+return (
+ repo:install($output-file),
+ trace($output-file,"repo: ")
+ )
diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm
index 1eda28c..8f32111 100644
--- a/src/Pdfbox3.xqm
+++ b/src/Pdfbox3.xqm
@@ -28,14 +28,13 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace File ="java:java.io.File";
-declare variable $pdfbox:package-version:="0.1.1";
+declare variable $pdfbox:package-version:="0.1.3"; (: keep equal to "version" in package.json :)
-(:~ SemVer version of this package
-with build metadata for Apache Pdfbox in use e.g. "0.1.0+pdfbox3.0.4"
+(:~ version of Apache Pdfbox in use e.g. "3.0.4"
:)
declare function pdfbox:version()
as xs:string{
- $pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
+ Q{java:org.apache.pdfbox.util.Version}getVersion()
};
(:~ with-document pattern: open pdf,apply function, close pdf
@@ -49,7 +48,7 @@ as item()*{
return try{
$fn($pdf),pdfbox:close($pdf)
} catch *{
- pdfbox:close($pdf),error()
+ pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
}
};
@@ -189,14 +188,16 @@ as map(*){
};
(:~ outline as xml :)
-declare function pdfbox:outline-xml($outline as map(*)*)
-as element(outline){
+declare function pdfbox:outline-xml($pdf as item())
+as element(outline)?{
 element outline { 
- $outline!pdfbox:bookmark-xml(.)
+ (: one bookmark-xml result per outline entry; a $pdf without outline gives an empty outline element :)
+ let $outline:=pdfbox:outline($pdf)
+ return $outline!pdfbox:bookmark-xml(.)
 }
};
-declare function pdfbox:bookmark-xml($outline as map(*)*)
+declare %private function pdfbox:bookmark-xml($outline as map(*)*)
as element(bookmark)*
{
$outline!
@@ -208,11 +209,11 @@ as element(bookmark)*
(:~ return bookmark info for children of $outlineItem
@return map like{index:,title:,hasChildren:}
:)
-declare function pdfbox:bookmark($bookmark as item(),$pdf as item())
+declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
as map(*)
{
map{
- "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
+ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
(:=>translate("�",""), :),
"hasChildren": PDOutlineItem:hasChildren($bookmark)
@@ -221,7 +222,7 @@ as map(*)
(:~ pageIndex of $page in $pdf :)
-declare function pdfbox:page-index(
+declare function pdfbox:find-page(
$page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
$pdf as item())
as item()?
@@ -268,8 +269,6 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
-
-
(:~ convert date :)
declare %private
function pdfbox:gregToISO($item as item())
diff --git a/src/metadata/basex.xml b/src/metadata/basex.xml
deleted file mode 100644
index f4f884f..0000000
--- a/src/metadata/basex.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
- pdfbox-3.0.4.jar
- pdfbox-io-3.0.4.jar
- fontbox-3.0.4.jar
- commons-logging-1.3.4.jar
- org.apache.pdfbox.pdmodel.PDDocument
-
\ No newline at end of file
diff --git a/src/metadata/expath-pkg.xml b/src/metadata/expath-pkg.xml
deleted file mode 100644
index 82d2c9b..0000000
--- a/src/metadata/expath-pkg.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
- BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3
-
-
- org.expkg_zone58.Pdfbox3
- Pdfbox3.xqm
-
-
diff --git a/src/test/test.xqm b/src/test/test.xqm
index 3aacef2..3e58034 100644
--- a/src/test/test.xqm
+++ b/src/test/test.xqm
@@ -10,7 +10,7 @@ declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
declare %unit:test
function test:pdfbox-version(){
let $v:= pdfbox:version()=>trace("VER: ")
- return unit:assert-equals($v,"0.1.0+pdfbox3.0.4")
+ return unit:assert-equals($v,"3.0.4")
};
declare %unit:test
@@ -44,7 +44,7 @@ function test:outline-present(){
declare %unit:test
function test:outline-xml(){
let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
- let $outline:=pdfbox:outline($pdf)=>pdfbox:outline-xml()
+ let $outline:=pdfbox:outline-xml($pdf)
return unit:assert-equals(count($outline/bookmark),31)
};
@@ -82,6 +82,13 @@ function test:page-image(){
return unit:assert(true())
};
+declare %unit:test
+function test:pdf-with(){
+ let $path:=file:resolve-path("samples.pdf/BaseX100.pdf",$test:base) (: a path, not test:pdf(): with-pdf opens and closes the document itself :)
+ let $txt:=pdfbox:with-pdf($path,pdfbox:page-text(?,101))
+ return unit:assert-equals($txt,"Options")
+};
+
declare function test:pdf($file as xs:string)
as item(){
file:resolve-path($file,$test:base)=>pdfbox:open()