1
0
Fork 0

[mod] auto build

This commit is contained in:
Andy Bunce 2025-02-08 23:01:54 +00:00
parent 9da83b5e27
commit fe687dd723
13 changed files with 124 additions and 96 deletions

View file

@ -11,7 +11,7 @@ on:
jobs:
test:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Set up Java
uses: actions/setup-java@v4

View file

@ -8,7 +8,7 @@ on:
jobs:
test:
runs-on: ubuntu-22.04
runs-on: ubuntu-latest
steps:
- name: Set up Java
uses: actions/setup-java@v4

View file

@ -1,21 +0,0 @@
# Pdfbox
A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3.
It is packaged in the [EXPath](https://docs.basex.org/main/Repository#expath_packaging) format and is tested against BaseX 10.7 and 11.7.
* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
## Features
* read PDF page count.
* read any PDF outline and return as map(s) or XML.
* read page labels.
* read page text.
* save a PDF page range to a new PDF.
* save a PDF page as an image.
## Build
* `scripts/make-xar.xq` packages the required `jar`s and `xqm` files to a `xar` file in the `dist` folder.
### Action support
The workflow `ci-basex.yaml` builds and tests the package. It can be run as an action on [GitHub](https://github.com/features/actions) or on a local [Gitea](https://docs.gitea.com/usage/actions/overview) installation.

View file

@ -1 +0,0 @@
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape' at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\"; 
\r\n"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> 
pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]}

View file

@ -1,8 +1,8 @@
{
"name": "pdfbox",
"version": "0.1.1",
"version": "0.1.3",
"description": "A BaseX interface to Apache Pdfbox version 3",
"main": "Pdfbox.xqm",
"main": "src/Pdfbox3.xqm",
"homepage": "https://github.com/npm/example#readme",
"directories": {
"doc": "docs"
@ -20,6 +20,16 @@
"author": "Andy Bunce",
"license": "Apache-2.0",
"quodatum": {
"random": true
"random": true,
"namespace": "org.expkg_zone58.Pdfbox3",
"main-class": "org.apache.pdfbox.pdmodel.PDDocument",
"download": "jars/",
"maven": [
"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
]
}
}

View file

@ -1,11 +1,19 @@
(:~ build utils for REPO packaging :)
module namespace build = 'urn:quodatum:build1';
(:~ build utils for REPO packaging
(:~ create a flat fat jar from jars in $input-dir
keeping only META-INF from $manifest-jar
:)
module namespace build = 'urn:quodatum:build1';
declare namespace bxpkg='http://www.basex.org/modules/pkg';
declare namespace pkg='http://expath.org/ns/pkg';
(:~ jar compress options :)
declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" };
declare variable $build:base:= file:resolve-path("../",static-base-uri())=>trace("base ");
declare variable $build:PKG:=json:doc(file:resolve-path("package.json",$build:base),map{"format":"xquery"});
(:~ return binary for fat jar from jars in $input-dir
keeping only META-INF from $manifest-jar
:)
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary {
let $fold :=
@ -54,29 +62,75 @@ as xs:base64Binary{
archive:update($jar,$name,$file)
};
declare function build:xar-create($base as xs:string)
(:~ Generate the text of the BaseX `basex.xml` package descriptor from package.json.
 : One <jar> entry is written for each file name returned by build:jars("name"),
 : followed by a <class> entry taken from the `quodatum` / `main-class` setting.
 : Values are inserted into the template as-is (no XML escaping is applied).
 : @return the descriptor as a string
 :)
declare function build:basex.xml()
as xs:string{
  let $jar-entries := for $jar in build:jars("name") return concat('<jar>',$jar,'</jar>')
  let $main-class := $build:PKG?quodatum?main-class
  return ``[<package xmlns="http://www.basex.org/modules/pkg">
`{ $jar-entries }`
<class>`{ $main-class }`</class>
</package>
]``
};
(:~ Generate the text of the EXPath `expath-pkg.xml` descriptor from package.json.
 : The package name and the module namespace both come from the `quodatum` / `namespace`
 : setting; abbrev, version and title come from `name`, `version` and `description`.
 : The module file is `main` with everything up to the last "/" removed.
 : Values are inserted into the template as-is (no XML escaping is applied).
 : @return the descriptor as a string
 :)
declare function build:expath-pkg.xml()
as xs:string{
  let $namespace := $build:PKG?quodatum?namespace
  let $module-file := replace($build:PKG?main,"^.*/","")
  return ``[<package xmlns="http://expath.org/ns/pkg"
name="`{$namespace}`"
abbrev="`{$build:PKG?name}`"
version="`{$build:PKG?version}`"
spec="1.0">
<title>`{$build:PKG?description}`</title>
<dependency processor="basex" name="value"/>
<xquery>
<namespace>`{$namespace}`</namespace>
<file>`{$module-file}`</file>
</xquery>
</package>
]``
};
declare function build:xar-create()
as xs:base64Binary{
let $entries:=
build:xar-add(map{},file:resolve-path("jars/",$base),"content/")
=>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/")
=>build:xar-add(file:resolve-path("src/metadata/",$base),"")
build:xar-add(map{},build:jars("content"),build:jars("download")!build:content(.))
=>build:xar-add("content/Pdfbox3.xqm",build:content("src/Pdfbox3.xqm"))
=>build:xar-add("expath-pkg.xml",convert:string-to-base64(build:expath-pkg.xml()))
=>build:xar-add("basex.xml",convert:string-to-base64(build:basex.xml()))
return archive:create($entries?name, $entries?content,$build:archive-opts)
};
(:~ zip data for $dir
:)
declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string)
(:~ Read a file as binary.
 : @param $path file path, resolved against $build:base
 : @return the file content as xs:base64Binary
 :)
declare function build:content($path as xs:string)
as xs:base64Binary{
  let $resolved := file:resolve-path($path,$build:base)
  return file:read-binary($resolved)
};
(:~ add (name,content) pairs to archive data :)
declare function build:xar-add($map as map(*),$xar-path as xs:string*,$content as item()*)
as map(*){
let $names:=if(file:is-dir($src))
then file:list($src)[not(starts-with(.,'.'))]!concat($src,.)
else $src
return map:merge((
$map,
map{"name":$names!concat($xar-dir,file:name(.)),
"content":$names!file:read-binary( .)}
),
map{"duplicates":"combine"}
)
map{"name": ($map?name,$xar-path), "content": ($map?content,$content)}
};
(:~ Location of the xar file for the current package version.
 : The file name embeds the `version` value from package.json and is resolved
 : against $build:base.
 : @return resolved path of dist/pdfbox-{version}.xar
 :)
declare function build:xar-path()
as xs:string{
  ``[dist/pdfbox-`{$build:PKG?version}`.xar]`` => file:resolve-path($build:base)
};
(:~ Jar names derived from the `quodatum` / `maven` list in package.json.
 : @param $style selects the form of the result:
 :   "name"     - bare file names (everything up to the last "/" removed);
 :   "download" - file names prefixed with the `quodatum` / `download` setting;
 :   "content"  - file names prefixed with "content/";
 :   any other value - the maven entries unchanged.
 : @return one string per maven entry
 :)
declare function build:jars($style as xs:string)
as xs:string*{
  let $maven := array:flatten($build:PKG?quodatum?maven)
  let $files := for $entry in $maven return replace($entry,"^.*/","")
  return
    if ($style = "name") then $files
    else if ($style = "download") then $files ! concat($build:PKG?quodatum?download, .)
    else if ($style = "content") then $files ! concat("content/", .)
    else $maven
};
(:~ download $files from $urls to $destdir:)

View file

@ -2,7 +2,7 @@
XQUERY "make xar.."
RUN make-xar.xq
XQUERY "Repo install.."
REPO INSTALL dist/pdfbox.xar
RUN repo-install.xq
REPO LIST

View file

@ -1,16 +1,8 @@
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base ");
declare variable $maven-urls := (
"org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar",
"org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar",
"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
);
let $_:=build:maven-download($maven-urls,$base || "jars/")
let $xar:=build:xar-create($base)
let $output-file := file:resolve-path("dist/pdfbox.xar",$base)
let $_:=build:maven-download($build:PKG?quodatum?maven=>array:flatten(),$build:base || "jars/")
let $xar:=build:xar-create()
let $output-file := build:xar-path()
return (build:write-binary($output-file, $xar),
trace($output-file,"xar: "))

8
scripts/repo-install.xq Normal file
View file

@ -0,0 +1,8 @@
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
(: Install the built xar into the BaseX repository.
 : The location is taken from build:xar-path() rather than rebuilt here, so the
 : file that is installed is always the one the build script writes.
 : Returns the result of repo:install followed by the traced xar path.
 :)
let $output-file := build:xar-path()
return (
  repo:install($output-file),
  trace($output-file,"repo: ")
)

View file

@ -28,14 +28,13 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace File ="java:java.io.File";
declare variable $pdfbox:package-version:="0.1.1";
declare variable $pdfbox:package-version:="0.1.2";
(:~ SemVer version of this package
with build metadata for Apache Pdfbox in use e.g. "0.1.0+pdfbox3.0.4"
(:~ version of Apache Pdfbox in use e.g. "3.0.4"
:)
declare function pdfbox:version()
as xs:string{
$pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
Q{java:org.apache.pdfbox.util.Version}getVersion()
};
(:~ with-document pattern: open pdf,apply function, close pdf
@ -49,7 +48,7 @@ as item()*{
return try{
$fn($pdf),pdfbox:close($pdf)
} catch *{
pdfbox:close($pdf),error()
pdfbox:close($pdf),fn:error($err:code,$src || " " || $err:description)
}
};
@ -189,14 +188,16 @@ as map(*){
};
(:~ outline as xml :)
declare function pdfbox:outline-xml($outline as map(*)*)
as element(outline){
declare function pdfbox:outline-xml($pdf as item())
as element(outline)?{
element outline {
$outline!pdfbox:bookmark-xml(.)
let $outline:=pdfbox:outline($pdf)
return if(exists($outline))
then <outline>{$outline!pdfbox:bookmark-xml(.)}</outline>
}
};
declare function pdfbox:bookmark-xml($outline as map(*)*)
declare %private function pdfbox:bookmark-xml($outline as map(*)*)
as element(bookmark)*
{
$outline!
@ -208,11 +209,11 @@ as element(bookmark)*
(:~ return bookmark info for children of $outlineItem
@return map like{index:,title:,hasChildren:}
:)
declare function pdfbox:bookmark($bookmark as item(),$pdf as item())
declare %private function pdfbox:bookmark($bookmark as item(),$pdf as item())
as map(*)
{
map{
"index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
"index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:find-page($pdf),
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
(:=>translate("<22>",""), :),
"hasChildren": PDOutlineItem:hasChildren($bookmark)
@ -221,7 +222,7 @@ as map(*)
(:~ pageIndex of $page in $pdf :)
declare function pdfbox:page-index(
declare function pdfbox:find-page(
$page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
$pdf as item())
as item()?
@ -268,8 +269,6 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
(:~ convert date :)
declare %private
function pdfbox:gregToISO($item as item())

View file

@ -1,7 +0,0 @@
<package xmlns="http://www.basex.org/modules/pkg">
<jar>pdfbox-3.0.4.jar</jar>
<jar>pdfbox-io-3.0.4.jar</jar>
<jar>fontbox-3.0.4.jar</jar>
<jar>commons-logging-1.3.4.jar</jar>
<class>org.apache.pdfbox.pdmodel.PDDocument</class>
</package>

View file

@ -1,13 +0,0 @@
<package xmlns="http://expath.org/ns/pkg"
name="org.expkg_zone58.Pdfbox3"
abbrev="pdfbox"
version="0.1.1"
spec="1.0">
<title>BaseX interface to Pdfbox (https://pdfbox.apache.org/) version 3</title>
<dependency processor="basex" name="value"/>
<xquery>
<namespace>org.expkg_zone58.Pdfbox3</namespace>
<file>Pdfbox3.xqm</file>
</xquery>
</package>

View file

@ -10,7 +10,7 @@ declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
declare %unit:test
function test:pdfbox-version(){
let $v:= pdfbox:version()=>trace("VER: ")
return unit:assert-equals($v,"0.1.0+pdfbox3.0.4")
return unit:assert-equals($v,"3.0.4")
};
declare %unit:test
@ -44,7 +44,7 @@ function test:outline-present(){
declare %unit:test
function test:outline-xml(){
let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
let $outline:=pdfbox:outline($pdf)=>pdfbox:outline-xml()
let $outline:=pdfbox:outline-xml($pdf)
return unit:assert-equals(count($outline/bookmark),31)
};
@ -82,6 +82,13 @@ function test:page-image(){
return unit:assert(true())
};
(:~ Checks that pdfbox:with-pdf returns "Options" when applying
 : pdfbox:page-text(?,101) to the BaseX100 sample.
 : NOTE(review): test:pdf() returns the result of pdfbox:open(); confirm that
 : pdfbox:with-pdf expects an opened document here rather than a file path.
 :)
declare %unit:test
function test:pdf-with(){
  let $expected := "Options"
  let $actual := test:pdf("samples.pdf/BaseX100.pdf") => pdfbox:with-pdf(pdfbox:page-text(?,101))
  return unit:assert-equals($actual,$expected)
};
declare function test:pdf($file as xs:string)
as item(){
file:resolve-path($file,$test:base)=>pdfbox:open()