1
0
Fork 0

[mod] lost

This commit is contained in:
Andy Bunce 2025-02-04 20:45:45 +00:00
parent 38d12d91c1
commit d37f923d09
8 changed files with 106 additions and 41 deletions

View file

@ -1,9 +1,11 @@
# Pdfbox # Pdfbox
A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3. A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3.
It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7 It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7.
* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
## Features ## Features
* read PDF page count. * read PDF page count.
* read any PDF outline and return as maps or XML. * read any PDF outline and return as map(s) or XML.
* read pagelabels. * read pagelabels.
* read page text. * read page text.
* save pdf page range to a new pdf. * save pdf page range to a new pdf.

View file

@ -1,13 +1,15 @@
{ {
"name": "pdfbox", "name": "pdfbox",
"version": "1.0.0", "version": "0.1.1",
"description": "A BaseX interface to Apache Pdfbox version 3", "description": "A BaseX interface to Apache Pdfbox version 3",
"main": "index.js", "main": "Pdfbox.xqm",
"homepage": "https://github.com/npm/example#readme",
"directories": { "directories": {
"doc": "docs" "doc": "docs"
}, },
"scripts": { "scripts": {
"test": "%BASEX10%/bin/basex -t src/test" "test": "%BASEX10%/bin/basex -t src/test",
"docs": "xqdoca"
}, },
"keywords": [ "keywords": [
"pdf", "pdf",
@ -16,5 +18,8 @@
"java" "java"
], ],
"author": "Andy Bunce", "author": "Andy Bunce",
"license": "Apache-2.0" "license": "Apache-2.0",
"quodatum": {
"random": true
}
} }

17
package.xml Normal file
View file

@ -0,0 +1,17 @@
<package name="org.expkg_zone58.Pdfbox3"
abbrev="pdfbox"
version="0.1.1"
spec="1.0">
<component name="pdfbox-3.0.4.jar">
<source type="maven">org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar</source>
</component>
<component name="pdfbox-io-3.0.4.jar">
<source type="maven">org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar</source>
</component>
<component name="fontbox-3.0.4.jar">
<source type="maven">org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar</source>
</component>
<component name="commons-logging-1.3.4.jar">
<source type="maven">commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar</source>
</component>
</package>

View file

@ -4,6 +4,8 @@ module namespace build = 'urn:quodatum:build1';
(:~ create a flat fat jar from jars in $input-dir (:~ create a flat fat jar from jars in $input-dir
keeping only META-INF from $manifest-jar keeping only META-INF from $manifest-jar
:) :)
declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" };
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string) declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary { as xs:base64Binary {
let $fold := let $fold :=
@ -17,9 +19,7 @@ function ($res as map (*), $jar as xs:string) {
} }
let $res := file:list($input-dir, false(), "*.jar") let $res := file:list($input-dir, false(), "*.jar")
=>fold-left( map { }, $fold) =>fold-left( map { }, $fold)
return return archive:create($res? name, $res? content,$build:archive-opts)
archive:create($res? name, $res? content,
map { "format" : "zip", "algorithm" : "deflate" })
}; };
(:~ create a fat jar with lib (:~ create a fat jar with lib
@ -34,8 +34,7 @@ declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as
,$lib) ,$lib)
let $content:=(archive:extract-binary($bin,$name) let $content:=(archive:extract-binary($bin,$name)
,$lib!file:read-binary($input-dir || .)) ,$lib!file:read-binary($input-dir || .))
return archive:create($name, $content, return archive:create($name, $content,$build:archive-opts)
map { "format" : "zip", "algorithm" : "deflate" })
}; };
(:~ update-manifest :) (:~ update-manifest :)
@ -61,15 +60,13 @@ as xs:base64Binary{
build:xar-add(map{},file:resolve-path("jars/",$base),"content/") build:xar-add(map{},file:resolve-path("jars/",$base),"content/")
=>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/") =>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/")
=>build:xar-add(file:resolve-path("src/metadata/",$base),"") =>build:xar-add(file:resolve-path("src/metadata/",$base),"")
return archive:create($entries?name, $entries?content, return archive:create($entries?name, $entries?content,$build:archive-opts)
map { "format" : "zip", "algorithm" : "deflate" })
}; };
(:~ zip data for $dir (:~ zip data for $dir
:) :)
declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string) declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string)
as map(*){ as map(*){
let $_:=trace(count($map?name),"size ")
let $names:=if(file:is-dir($src)) let $names:=if(file:is-dir($src))
then file:list($src)[not(starts-with(.,'.'))]!concat($src,.) then file:list($src)[not(starts-with(.,'.'))]!concat($src,.)
else $src else $src
@ -95,7 +92,7 @@ as empty-sequence(){
}; };
(:~ write-binary, creating dir if required :) (:~ write-binary, creating dir if required :)
declare function build:write-binary($dest as xs:string,$contents) declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?)
as empty-sequence(){ as empty-sequence(){
file:create-dir(file:parent($dest)), file:create-dir(file:parent($dest)),
file:write-binary($dest,$contents) file:write-binary($dest,$contents)

View file

@ -13,4 +13,4 @@ let $_:=build:maven-download($maven-urls,$base || "jars/")
let $xar:=build:xar-create($base) let $xar:=build:xar-create($base)
let $output-file := file:resolve-path("dist/pdfbox.xar",$base) let $output-file := file:resolve-path("dist/pdfbox.xar",$base)
return (build:write-binary($output-file, $xar), return (build:write-binary($output-file, $xar),
trace($output-file,"zar: ")) trace($output-file,"xar: "))

22
scripts/maven.xqm Normal file
View file

@ -0,0 +1,22 @@
(:~ maven access
:
::)
module namespace mvn = 'urn:quodatum:maven:1';
(:~ sample Maven dependency element, in the shape mvn:url takes as $dep :)
declare variable $mvn:example := <dependency>
<groupId>org.ccil.cowan.tagsoup</groupId>
<artifactId>tagsoup</artifactId>
<version>1.2.1</version>
</dependency>;
(:~ download url on Maven Central for the artifact described by $dep
 : The repository path is groupId (dots replaced by "/"), artifactId and
 : version, followed by the file name artifactId-version.ext.
 : Children are selected by name, so their order and any additional
 : children of $dep (e.g. scope) do not affect the result.
 : @param $dep dependency element with groupId, artifactId and version children
 : @param $ext file extension without leading dot, e.g. "jar"
 : @return absolute url as string
 :)
declare function mvn:url($dep as element(dependency),$ext as xs:string)
as xs:string {
  let $group    := normalize-space($dep/groupId)
  let $artifact := normalize-space($dep/artifactId)
  let $version  := normalize-space($dep/version)
  return string-join((
    "https://repo.maven.apache.org/maven2/",
    (: repository layout uses one directory per groupId segment :)
    translate($group, ".", "/"), "/",
    $artifact, "/",
    $version, "/",
    $artifact, "-", $version, ".", $ext
  ))
};

View file

@ -4,8 +4,8 @@ pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library,
requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar
@see download https://pdfbox.apache.org/download.cgi @see download https://pdfbox.apache.org/download.cgi
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/
:) :)
module namespace pdfbox="org.expkg_zone58.Pdfbox3"; module namespace pdfbox="org.expkg_zone58.Pdfbox3";
declare namespace Loader ="java:org.apache.pdfbox.Loader"; declare namespace Loader ="java:org.apache.pdfbox.Loader";
@ -28,21 +28,30 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace File ="java:java.io.File"; declare namespace File ="java:java.io.File";
declare variable $pdfbox:package-version:="0.1.1";
(:~ SemVer version of this package (:~ SemVer version of this package
with build metadata for Apacke Pdfbox in use e.g. "0.1.0+pdfbox3.0.4" with build metadata for Apache Pdfbox in use e.g. "0.1.0+pdfbox3.0.4"
:) :)
declare function pdfbox:version() declare function pdfbox:version()
as xs:string{ as xs:string{
"0.1.0+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion() $pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
}; };
(: open pdf,apply function, close pdf (:~ with-document pattern: open pdf,apply function, close pdf
with-document pattern, creates local pdfobject and ensures it is closed creates a local pdfobject and ensures it is closed after use
e.g "path..." => pdfbox:with-pdf("path...",pdfbox:page-text(?,5)) e.g pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
:) :)
declare function pdfbox:with-pdf($src as xs:string,$fn as function(*)*) declare function pdfbox:with-pdf($src as xs:string,
$fn as function(item())as item()*)
as item()*{ as item()*{
"@TODO" let $pdf:=pdfbox:open($src)
return try{
$fn($pdf),pdfbox:close($pdf)
} catch *{
pdfbox:close($pdf),error()
}
}; };
(:~ open pdf, returns pdf object :) (:~ open pdf, returns pdf object :)
@ -63,7 +72,7 @@ as xs:string{
PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
}; };
(:~ save pdf $pdf to $savepath , returns $savepath :) (:~ save pdf $pdf to filesystem at $savepath , returns $savepath :)
declare function pdfbox:save($pdf as item(),$savepath as xs:string) declare function pdfbox:save($pdf as item(),$savepath as xs:string)
as xs:string{ as xs:string{
PDDocument:save($pdf, File:new($savepath)),$savepath PDDocument:save($pdf, File:new($savepath)),$savepath
@ -97,7 +106,7 @@ as xs:base64Binary{
}; };
(:~ map with document metadata :) (:~ map with document metadata :)
declare function pdfbox:information($pdf as item()) declare function pdfbox:metadata($pdf as item())
as map(*){ as map(*){
let $info:=PDDocument:getDocumentInformation($pdf) let $info:=PDDocument:getDocumentInformation($pdf)
return map{ return map{
@ -111,7 +120,30 @@ as map(*){
} }
}; };
(:~ summary info as map for the pdf at $pdfpath
 : Opens the document, merges the entries file, pages, hasOutline and
 : specification with the map from pdfbox:metadata, then closes the
 : document again so the handle is not leaked, also when a step fails.
 : NOTE(review): assumes pdfbox:close returns the empty sequence, as its
 : use inside pdfbox:with-pdf suggests - confirm.
 : @param $pdfpath path of the pdf, passed to pdfbox:open
 : @return single merged map
 :)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
let $pdf:=pdfbox:open($pdfpath)
return try{
  let $report:=(map{
      "file": $pdfpath,
      "pages": pdfbox:page-count($pdf),
      "hasOutline": pdfbox:hasOutline($pdf),
      "specification":pdfbox:specification($pdf)
    },pdfbox:metadata($pdf)
  )=>map:merge()
  (: build the map first, then release the document :)
  return ($report,pdfbox:close($pdf))
} catch * {
  (: release the document, then re-raise the original error unchanged :)
  pdfbox:close($pdf),
  error($err:code,$err:description,$err:value)
}
};
(:~ test whether $pdf has a document outline
 : @param $pdf value handed to PDDocument:getDocumentCatalog
 : @return true() when getDocumentOutline() on the catalog yields a value,
 :         false() when it yields nothing
 :)
declare function pdfbox:hasOutline($pdf as item())
as xs:boolean{
  (# db:wrapjava some #) {
    PDDocumentCatalog:getDocumentOutline(PDDocument:getDocumentCatalog($pdf))
    =>exists()
  }
};
(:~ outline for $pdf as map()* :) (:~ outline for $pdf as map()* :)
declare function pdfbox:outline($pdf as item()) declare function pdfbox:outline($pdf as item())
@ -236,17 +268,7 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)} return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
}; };
(:~ summary info as map for $pdfpath :)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
let $doc:=pdfbox:open($pdfpath)
return (map{
"file": $pdfpath,
"pages": pdfbox:page-count($doc),
"outline": pdfbox:outline($doc)=>count()
},pdfbox:information($doc)
)=>map:merge()
};
(:~ convert date :) (:~ convert date :)
declare %private declare %private

View file

@ -1,7 +1,7 @@
<package xmlns="http://expath.org/ns/pkg" <package xmlns="http://expath.org/ns/pkg"
name="org.expkg_zone58.Pdfbox3" name="org.expkg_zone58.Pdfbox3"
abbrev="pdfbox" abbrev="pdfbox"
version="0.1.0" version="0.1.1"
spec="1.0"> spec="1.0">
<title>BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3</title> <title>BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3</title>