diff --git a/README.md b/README.md
index 1faa1cd..a682f5c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,11 @@
# Pdfbox
A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3.
-It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7
+It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7.
+
+* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
## Features
* read PDF page count.
-* read any PDF outline and return as maps or XML.
+* read any PDF outline and return as map(s) or XML.
* read pagelabels.
* read page text.
* save pdf page range to a new pdf.
diff --git a/package.json b/package.json
index 9f002a7..ab537f8 100644
--- a/package.json
+++ b/package.json
@@ -1,13 +1,15 @@
{
"name": "pdfbox",
- "version": "1.0.0",
+ "version": "0.1.1",
"description": "A BaseX interface to Apache Pdfbox version 3",
- "main": "index.js",
+ "main": "Pdfbox.xqm",
+ "homepage": "https://github.com/npm/example#readme",
"directories": {
"doc": "docs"
},
"scripts": {
- "test": "%BASEX10%/bin/basex -t src/test"
+ "test": "%BASEX10%/bin/basex -t src/test",
+ "docs": "xqdoca"
},
"keywords": [
"pdf",
@@ -16,5 +18,8 @@
"java"
],
"author": "Andy Bunce",
- "license": "Apache-2.0"
-}
+ "license": "Apache-2.0",
+ "quodatum": {
+ "random": true
+ }
+}
\ No newline at end of file
diff --git a/package.xml b/package.xml
new file mode 100644
index 0000000..d18c01f
--- /dev/null
+++ b/package.xml
@@ -0,0 +1,17 @@
+
+
+ org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar
+
+
+ org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar
+
+
+ org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar
+
+
+ commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar
+
+
\ No newline at end of file
diff --git a/scripts/build.xqm b/scripts/build.xqm
index 08efe98..d7cf5dd 100644
--- a/scripts/build.xqm
+++ b/scripts/build.xqm
@@ -4,6 +4,8 @@ module namespace build = 'urn:quodatum:build1';
(:~ create a flat fat jar from jars in $input-dir
keeping only META-INF from $manifest-jar
:)
+declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" };
+
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary {
let $fold :=
@@ -17,9 +19,7 @@ function ($res as map (*), $jar as xs:string) {
}
let $res := file:list($input-dir, false(), "*.jar")
=>fold-left( map { }, $fold)
-return
- archive:create($res? name, $res? content,
- map { "format" : "zip", "algorithm" : "deflate" })
+return archive:create($res? name, $res? content,$build:archive-opts)
};
(:~ create a fat jar with lib
@@ -34,8 +34,7 @@ declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as
,$lib)
let $content:=(archive:extract-binary($bin,$name)
,$lib!file:read-binary($input-dir || .))
-return archive:create($name, $content,
- map { "format" : "zip", "algorithm" : "deflate" })
+return archive:create($name, $content,$build:archive-opts)
};
(:~ update-manifest :)
@@ -61,15 +60,13 @@ as xs:base64Binary{
build:xar-add(map{},file:resolve-path("jars/",$base),"content/")
=>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/")
=>build:xar-add(file:resolve-path("src/metadata/",$base),"")
- return archive:create($entries?name, $entries?content,
- map { "format" : "zip", "algorithm" : "deflate" })
+ return archive:create($entries?name, $entries?content,$build:archive-opts)
};
(:~ zip data for $dir
:)
declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string)
as map(*){
-let $_:=trace(count($map?name),"size ")
let $names:=if(file:is-dir($src))
then file:list($src)[not(starts-with(.,'.'))]!concat($src,.)
else $src
@@ -95,7 +92,7 @@ as empty-sequence(){
};
(:~ write-binary, creating dir if required :)
-declare function build:write-binary($dest as xs:string,$contents)
+declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?)
as empty-sequence(){
file:create-dir(file:parent($dest)),
file:write-binary($dest,$contents)
diff --git a/scripts/make-xar.xq b/scripts/make-xar.xq
index 1a2aa9a..8ce21fb 100644
--- a/scripts/make-xar.xq
+++ b/scripts/make-xar.xq
@@ -13,4 +13,4 @@ let $_:=build:maven-download($maven-urls,$base || "jars/")
let $xar:=build:xar-create($base)
let $output-file := file:resolve-path("dist/pdfbox.xar",$base)
return (build:write-binary($output-file, $xar),
- trace($output-file,"zar: "))
+ trace($output-file,"xar: "))
diff --git a/scripts/maven.xqm b/scripts/maven.xqm
new file mode 100644
index 0000000..bf4d10f
--- /dev/null
+++ b/scripts/maven.xqm
@@ -0,0 +1,22 @@
+(:~ maven access
+ :
+ ::)
+module namespace mvn = 'urn:quodatum:maven:1';
+
+
+declare variable $mvn:example := <dependency>
+    <groupId>org.ccil.cowan.tagsoup</groupId>
+    <artifactId>tagsoup</artifactId>
+    <version>1.2.1</version>
+</dependency>;
+(:~ Maven Central download URL for dependency $dep with file extension $ext (no leading dot), e.g. "jar" :)
+declare function mvn:url($dep as element(dependency),$ext as xs:string)
+as xs:string {
+ (: repository layout is group/as/path/artifact/version/artifact-version.ext, so dots in groupId become "/" :)
+ string-join(
+   ("https://repo.maven.apache.org/maven2/",
+   string-join((translate($dep/groupId, ".", "/"), $dep/artifactId, $dep/version), "/"),
+   "/",$dep/artifactId, "-", $dep/version, ".",$ext
+   ))
+ };
+
diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm
index a6af956..1eda28c 100644
--- a/src/Pdfbox3.xqm
+++ b/src/Pdfbox3.xqm
@@ -4,8 +4,8 @@ pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library,
requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar
@see download https://pdfbox.apache.org/download.cgi
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/
-
:)
+
module namespace pdfbox="org.expkg_zone58.Pdfbox3";
declare namespace Loader ="java:org.apache.pdfbox.Loader";
@@ -28,21 +28,30 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace File ="java:java.io.File";
+declare variable $pdfbox:package-version:="0.1.1";
+
(:~ SemVer version of this package
-with build metadata for Apacke Pdfbox in use e.g. "0.1.0+pdfbox3.0.4"
+with build metadata for Apache Pdfbox in use e.g. "0.1.1+pdfbox3.0.4"
:)
declare function pdfbox:version()
as xs:string{
- "0.1.0+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
+ $pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
};
-(: open pdf,apply function, close pdf
-with-document pattern, creates local pdfobject and ensures it is closed
-e.g "path..." => pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
+(:~ with-document pattern: open the pdf at $src, apply $fn to the pdf object, then close it.
+ The pdf object is closed whether $fn succeeds or raises; an error raised by $fn is re-raised after closing.
+ e.g. pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
:)
-declare function pdfbox:with-pdf($src as xs:string,$fn as function(*)*)
+declare function pdfbox:with-pdf($src as xs:string,
+                                $fn as function(item())as item()*)
as item()*{
- "@TODO"
+ let $pdf:=pdfbox:open($src)
+ return try{
+     $fn($pdf),pdfbox:close($pdf)
+ } catch *{
+     (: close, then re-raise the caught error; a bare error() would discard its code, message and value :)
+     pdfbox:close($pdf),error($err:code,string($err:description),$err:value)
+ }
};
(:~ open pdf, returns pdf object :)
@@ -63,7 +72,7 @@ as xs:string{
PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
};
-(:~ save pdf $pdf to $savepath , returns $savepath :)
+(:~ save pdf $pdf to the filesystem at $savepath; returns $savepath :)
declare function pdfbox:save($pdf as item(),$savepath as xs:string)
as xs:string{
PDDocument:save($pdf, File:new($savepath)),$savepath
@@ -97,7 +106,7 @@ as xs:base64Binary{
};
(:~ map with document metadata :)
-declare function pdfbox:information($pdf as item())
+declare function pdfbox:metadata($pdf as item())
as map(*){
let $info:=PDDocument:getDocumentInformation($pdf)
return map{
@@ -105,13 +114,36 @@ as map(*){
"creator": PDDocumentInformation:getCreator($info),
"producer": PDDocumentInformation:getProducer($info),
"subject": PDDocumentInformation:getSubject($info),
- "keywords": PDDocumentInformation:getKeywords($info),
- "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
+ "keywords": PDDocumentInformation:getKeywords($info),
+ "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
"author": PDDocumentInformation:getAuthor($info)
}
};
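+(:~ summary info as map for the pdf at $pdfpath. NOTE(review): the pdf opened here is not closed in this function - confirm whether that is intended :)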
+declare function pdfbox:report($pdfpath as xs:string)
+as map(*){
+ let $pdf:=pdfbox:open($pdfpath)
+ return (map{
+ "file": $pdfpath,
+ "pages": pdfbox:page-count($pdf),
+ "hasOutline": pdfbox:hasOutline($pdf),
+ "specification":pdfbox:specification($pdf)
+ },pdfbox:metadata($pdf)
+)=>map:merge()
+};
+
+ (:~ true if $pdf has a document outline :)
+declare function pdfbox:hasOutline($pdf as item())
+as xs:boolean{
+ (# db:wrapjava some #) {
+ let $outline:=
+ PDDocument:getDocumentCatalog($pdf)
+ =>PDDocumentCatalog:getDocumentOutline()
+ return exists($outline)
+ }
+};
(:~ outline for $pdf as map()* :)
declare function pdfbox:outline($pdf as item())
@@ -236,17 +268,7 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
-(:~ summary info as map for $pdfpath :)
-declare function pdfbox:report($pdfpath as xs:string)
-as map(*){
- let $doc:=pdfbox:open($pdfpath)
- return (map{
- "file": $pdfpath,
- "pages": pdfbox:page-count($doc),
- "outline": pdfbox:outline($doc)=>count()
- },pdfbox:information($doc)
-)=>map:merge()
-};
+
(:~ convert date :)
declare %private
diff --git a/src/metadata/expath-pkg.xml b/src/metadata/expath-pkg.xml
index 0b18387..82d2c9b 100644
--- a/src/metadata/expath-pkg.xml
+++ b/src/metadata/expath-pkg.xml
@@ -1,7 +1,7 @@
BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3