This commit is contained in:
Andy Bunce 2024-04-08 23:03:36 +01:00
parent 2b131ee71a
commit bd5b7cf006
14 changed files with 249 additions and 39 deletions

View file

@ -3,8 +3,8 @@ xquery version '3.1';
pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library,
requires pdfbox jar on classpath
3.0.2+ required; tested with pdfbox-app-3.0.2.jar
@see https://repository.apache.org/content/groups/snapshots/org/apache/pdfbox/pdfbox-app/3.0.2-SNAPSHOT/
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/
@see download https://pdfbox.apache.org/download.cgi
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/
:)
module namespace pdfbox="urn:expkg-zone58:pdfbox3";
@ -31,11 +31,11 @@ declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
(:~
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html
:)
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
(:~
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html
:)
@ -80,26 +80,53 @@ as xs:integer{
PDDocument:getNumberOfPages($doc)
};
(:~
 Document metadata of $doc as a map.
 Keys: title, creator, producer, subject, keywords, creationdate, author.
 Each value is whatever the matching PDDocumentInformation getter returns;
 creationdate is additionally passed through pdfbox:gregToISO.
 @param $doc open PDDocument
 @return map of metadata values
:)
declare function pdfbox:information($doc as item())
as map(*){
  let $meta := $doc=>PDDocument:getDocumentInformation()
  return map{
    "title":        $meta=>PDDocumentInformation:getTitle(),
    "creator":      $meta=>PDDocumentInformation:getCreator(),
    "producer":     $meta=>PDDocumentInformation:getProducer(),
    "subject":      $meta=>PDDocumentInformation:getSubject(),
    "keywords":     $meta=>PDDocumentInformation:getKeywords(),
    "creationdate": $meta=>PDDocumentInformation:getCreationDate()=>pdfbox:gregToISO(),
    "author":       $meta=>PDDocumentInformation:getAuthor()
  }
};
(:~
 Convert a java.util.GregorianCalendar to its ZonedDateTime string form.
 @param $item Java GregorianCalendar, or the empty sequence
        (BaseX hands a Java null back as the empty sequence)
 @return string value of the calendar's ZonedDateTime, or the empty sequence
         when no calendar was supplied
 NOTE(review): ZonedDateTime's string form may carry a bracketed zone id, so the
 result is not guaranteed to be a strict xs:dateTime lexical form - confirm.
:)
declare
function pdfbox:gregToISO($item as item()?)
as xs:string?{
  (: date fields are optional in a PDF: map over $item so a missing date
     yields the empty sequence instead of a type error :)
  $item ! (Q{java:java.util.GregorianCalendar}toZonedDateTime(.)=>string())
};
(:~
 Outline (bookmarks) of $doc as a sequence of maps.
 Reads the document outline from the document catalog and passes its first
 child to the two-argument pdfbox:outline.
 NOTE(review): this span carries both the $bookmark and the $outline variant of
 the body (old and new sides of a change) - confirm which lines are live.
 @param $doc open PDDocument
 @return sequence of bookmark maps
:)
declare function pdfbox:outline($doc as item())
as map(*)*{
(# db:wrapjava some #) {
let $bookmark:=
let $outline:=
PDDocument:getDocumentCatalog($doc)
=>PDDocumentCatalog:getDocumentOutline()
=>PDOutlineItem:getFirstChild()
let $bk:=pdfbox:outline($doc,$bookmark)
return $bk
return if(exists($outline))
then pdfbox:outline($doc,PDOutlineItem:getFirstChild($outline))
}
};
(: return bookmark info for children of $outlineItem as seq of maps :)
(:~ return bookmark info for children of $outlineItem as seq of maps :)
declare function pdfbox:outline($doc as item(),$outlineItem as item()?)
as map(*)*
{
let $find:=hof:until(
as map(*)*{
let $find as map(*):=pdfbox:_outline($doc ,$outlineItem)
return map:get($find,"list")
};
(: BaseX bug 10.7? error if inlined in outline :)
declare function pdfbox:_outline($doc as item(),$outlineItem as item()?)
as map(*){
hof:until(
function($output) { empty($output?this) },
function($input ) {
let $bk:= pdfbox:bookmark($input?this,$doc)
@ -112,10 +139,9 @@ as map(*)*
"this": PDOutlineItem:getNextSibling($input?this)}
},
map{"list":(),"this":$outlineItem}
)
return $find?list
)
};
(:~ outline as xml :)
declare function pdfbox:outline-xml($outline as map(*)*)
as element(outline){
element outline {
@ -175,19 +201,26 @@ as xs:string
};
(:~ pageLabel for every page
(:~ pageLabel info
@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
:)
declare function pdfbox:getPageLabels($doc as item())
as item()
{
  (: the page-label object is obtained from the document catalog :)
  let $catalog := PDDocument:getDocumentCatalog($doc)
  return PDDocumentCatalog:getPageLabels($catalog)
};
(:~
 Page labels of $doc, as returned by PDPageLabels.getLabelsByPageIndices().
 @param $doc open PDDocument
 @return the label strings; the empty sequence when the document catalog
         has no page labels
:)
declare function pdfbox:pageLabels($doc as item())
as xs:string*
{
  let $labels := PDDocument:getDocumentCatalog($doc)
                 =>PDDocumentCatalog:getPageLabels()
  (: a PDF without page labels gives no PDPageLabels object (Java null becomes
     the empty sequence); map over $labels so that case returns () instead of
     failing on the instance call :)
  return $labels ! PDPageLabels:getLabelsByPageIndices(.)
};
(: text on $pageNo :)
(:~ return text on $pageNo :)
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
as xs:string{
let $tStripper := (# db:wrapjava instance #) {
@ -198,4 +231,19 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
(:~
 Summary of the PDF at $pdfpath as a single map.
 Contains "file" (the path as given), "pages" (page count), "outline"
 (number of maps returned by pdfbox:outline) merged with the entries of
 pdfbox:information.
 @param $pdfpath path of the PDF to open
 @return merged summary map
:)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
  let $doc := pdfbox:open($pdfpath)
  let $summary := map{
    "file": $pdfpath,
    "pages": pdfbox:page-count($doc),
    "outline": count(pdfbox:outline($doc))
  }
  return map:merge(($summary, pdfbox:information($doc)))
};
(:~
 Render page $pageNo of $doc as an image - NOT YET IMPLEMENTED.
 @TODO implement, along the lines of the Java call
       BufferedImage image = pdfRenderer.renderImageWithDPI(i, 200, ImageType.RGB)
 @param $doc open PDDocument
 @param $pageNo page to render
 @error pdfbox:not-implemented always raised until the function is implemented
:)
declare function pdfbox:pageAsImage($doc as item(), $pageNo as xs:integer)
as item(){
  (: an empty body would yield the empty sequence and break the declared item()
     return type with an opaque type error; fail explicitly instead :)
  error(xs:QName("pdfbox:not-implemented"),
        "pdfbox:pageAsImage is not implemented yet")
};

View file

@ -1,6 +1,6 @@
(: test use of pageIndex :)
import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "../src/lib/pdfbox3.xqm";
import module namespace pagenos = 'urn:pageno' at "../src/lib/pageno.xqm";
import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm";
declare variable $base:=file:base-dir();
declare function local:go($doc,$pdf as element(pdf)){
let $range:=$pdf/@pages/tokenize(.,"")
@ -11,7 +11,7 @@ declare function local:go($doc,$pdf as element(pdf)){
};
let $src:="257107---Book_File-Web_PDF_9798400691218_486731.pdf"=>file:resolve-path($base)
let $doc:=pdfbox:open($src)
let $labels:= pdfbox:getPageLabels($doc)
let $labels:= pdfbox:pageLabels($doc)
let $pdfs:=doc("pdfs\chunks-docbook.xml")/chunks/pdf
for $pdf in $pdfs
let $range:=$pdf/@pages/tokenize(.,"")

View file

@ -8,15 +8,17 @@ declare variable $samples:= map{
"climate": "data\drop-01d\set\2-6-1\A5579C_1\271989---Book_File-Web_PDF_9798400627484_486728.pdf",
"women": "data\drop-01d\set\2-6-1\A6229C_1\257334---Book_File-Web_PDF_9798216172628_486742.pdf",
"genocide": "data\drop1-pdf\GR2967-TRD\272791---Book_File-Web_PDF_9798400640216_486366.pdf",
"world": "data\drop-01c\gpg-book\2-6\A3506C-TRD\256186---Book_File-Web_PDF_9798216038955_486148.pdf",
"dummy": "lib\abc-clio-dummy.pdf"
"world": "data\drop-01c\gpg-book\2-6\A3506C-TRD\256186---Book_File-Web_PDF_9798216038955_486148.pdf"
};
declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data";
(:~ resolve :)
declare variable $PDF:= $samples?women=>file:resolve-path($base);
declare variable $PDF:=
$samples?world=>file:resolve-path($base)
(: "C:\Users\mrwhe\git\expkg-zone58\pdfbox\samples.pdf\icelandic-dictionary.pdf" :)
;
let $doc:=pdfbox:open($PDF)
return pdfbox:outline($doc)=>pdfbox:outline-xml()
(: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :)
return pdfbox:information($doc)

69
src/test/test.xqm Normal file
View file

@ -0,0 +1,69 @@
(:~ tests for pdfbox3
:)
module namespace test="urn:expkg-zone58:pdfbox3:tests";
import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm";
declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
(:~ the reported pdfbox version string starts with "3.0" :)
declare %unit:test
function test:pdfbox-version(){
  let $version := pdfbox:version()
  return unit:assert(starts-with($version,"3.0"))
};
(:~ the BaseX100 sample is expected to report 521 pages :)
declare %unit:test
function test:page-count(){
  let $doc := pdfbox:open(test:resolve("samples.pdf/BaseX100.pdf"))
  return unit:assert-equals(pdfbox:page-count($doc),521)
};
(:~ the BaseX100 sample is expected to yield an empty outline :)
declare %unit:test
function test:outline-none(){
  let $doc := test:resolve("samples.pdf/BaseX100.pdf")=>pdfbox:open()
  let $bookmarks := pdfbox:outline($doc)
  return unit:assert(empty($bookmarks))
};
(:~ the icelandic-dictionary sample is expected to yield a non-empty outline :)
declare %unit:test
function test:outline-present(){
  let $doc := test:resolve("samples.pdf/icelandic-dictionary.pdf")=>pdfbox:open()
  let $bookmarks := pdfbox:outline($doc)
  return unit:assert(exists($bookmarks))
};
(:~ the XML outline of the icelandic-dictionary sample is expected to have 31 bookmark children :)
declare %unit:test
function test:outline-xml(){
  let $doc := pdfbox:open(test:resolve("samples.pdf/icelandic-dictionary.pdf"))
  let $xml := pdfbox:outline-xml(pdfbox:outline($doc))
  return unit:assert-equals(count($xml/bookmark),31)
};
(:~
 Page labels of the BaseX100 sample: first page is labelled "i",
 the 27th page is labelled "1".
 Uses unit:assert-equals so a failure reports the returned and expected values.
:)
declare %unit:test
function test:pagelabels(){
  let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
  let $labels:=pdfbox:open($PDF)=>pdfbox:pageLabels()
  return (
    unit:assert-equals($labels[1],"i"),
    unit:assert-equals($labels[27],"1")
  )
};
(:~
 Smoke test: call pdfbox:extract on the BaseX100 sample with a temp-file path.
 Only checks that the call completes; its result is not inspected.
 NOTE(review): confirm the argument order 2,12,$dest against the pdfbox:extract signature.
:)
declare %unit:test
function test:save(){
  let $target := file:create-temp-file("test",".pdf")=>trace("DEST: ")
  let $source := test:resolve("samples.pdf/BaseX100.pdf")
  let $result := pdfbox:extract(pdfbox:open($source),2,12,$target)
  return unit:assert(true())
};
(:~ text returned by getText for page 1 of the BaseX100 sample starts with "BaseX Documentation" :)
declare %unit:test
function test:page-text(){
  let $doc := pdfbox:open(test:resolve("samples.pdf/BaseX100.pdf"))
  let $pageText := pdfbox:getText($doc,1)
  return unit:assert(starts-with($pageText,"BaseX Documentation"))
};
(:~ resolve $file against $test:base :)
declare function test:resolve($file as xs:string){
  $file=>file:resolve-path($test:base)
};

View file

@ -8,7 +8,7 @@ declare
%rest:path('/pdf/api/sources')
%output:method("json")
%output:json("format=xquery")
function api:apt()
function api:apt() as map(*)
{
let $base:="C:/Users/mrwhe/git/expkg-zone58/pdfbox/data/"
let $d:="1e/"
@ -18,6 +18,7 @@ function api:apt()
"items": array{$f!api:path-info(.)}
}
};
declare function api:path-info($file as xs:string)
as map(*)
{

View file

@ -1,6 +1,13 @@
Uses
* https://github.com/blikblum/slick-router#readme
* https://dev.to/blikblum/slick-router-a-powerful-router-for-web-components-3fck
## Sync
```
cd C:\Users\mrwhe\git\expkg-zone58\pdfbox\src\webapp\pdf
c:\DeltaCopy\rsync -rlptz --progress --exclude=.git --exclude=.vscode . andy@localhost::basexserv/
```
## random html in markdown
<style>table, th, td {
border: 1px solid black;

View file

@ -14,6 +14,11 @@ customElements.define('application-view',
}
}
})
this.addEventListener('load', e => {
const data = e.detail;
notify(JSON.stringify(data.items[0]));
}
)
// Custom function to emit toast notifications
function notify(message, variant = 'primary', icon = 'info-circle', duration = 3000) {
const alert = Object.assign(document.createElement('sl-alert'), {
@ -83,7 +88,6 @@ customElements.define('application-view',
)
customElements.define('home-view',
class HomeView extends withRouterLinks(HTMLElement) {
connectedCallback() {
this.getModel();
}
@ -100,13 +104,13 @@ customElements.define('home-view',
}
renderPosts(data) {
const count = data.count
const shadowRoot = this.attachShadow({ mode: "closed" });
const shadowRoot = this.attachShadow({ mode: "open" });
const div = document.createElement("div", { class: "cards" });
shadowRoot.appendChild(div);
data.items.forEach(item => {
shadowRoot.appendChild(Object.assign(
document.createElement('sl-card'), {
textContent: item.slug
}
))
div.appendChild(Object.assign(
document.createElement('sl-card'), { class: "card", textContent: item.slug })
)
})
}
}
@ -189,7 +193,8 @@ customElements.define('settings-view',
<sl-icon slot="icon" name="gear"></sl-icon>
<strong>Your settings have been updated</strong><br />
Settings will take effect on next login.
</sl-alert>
</sl-alert>
<fetch-json src='/pdf/api/sources'/>
</div>
`
}
@ -213,6 +218,35 @@ customElements.define('profile-view',
// <profile-index-view>: shows a "<user> profile" heading for the current route.
customElements.define('profile-index-view',
  class ProfileIndexView extends HTMLElement {
    connectedCallback() {
      // The user name comes from the route parameters (i.e. the URL), so it is
      // set via textContent rather than interpolated into innerHTML, which
      // would let markup in the URL be parsed as HTML.
      const heading = document.createElement('h2');
      heading.textContent = `${this.$route.params.user} profile`;

      const wrapper = document.createElement('div');
      wrapper.className = 'ProfileIndex';
      wrapper.appendChild(heading);

      // Same effect as assigning innerHTML: previous children are replaced.
      this.replaceChildren(wrapper);
    }
  }
)
customElements.define('cards-panel',
class CardPanel extends HTMLElement {
constructor(){
super();
const template = document.createElement('template');
template.id = 'pool-calculator-template';
template.innerHTML = `
<style>
</style>
<div class="input-section">
<!-- ... -->
</div>
`;
}
connectedCallback() {
this.innerHTML = `
<div class='ProfileIndex'>
@ -222,4 +256,35 @@ customElements.define('profile-index-view',
}
}
)
// <fetch-json src="...">: fetches JSON from `src`, renders a one-line summary
// and announces the payload through a bubbling, composed "load" CustomEvent.
customElements.define('fetch-json',
  class FetchJson extends HTMLElement {
    static observedAttributes = ["src", "size"];

    connectedCallback() {
      // Report failures instead of leaving an unhandled promise rejection.
      this.getModel().catch((error) => console.error('fetch-json:', error));
    }

    /**
     * Fetch the JSON document named by the `src` attribute, keep it on
     * `this.data` and render it.
     * @returns {Promise<void>} resolves once rendered; rejects if the fetch,
     *          the JSON parsing or the rendering fails.
     */
    getModel() {
      // Query parameters are kept exactly as before.
      const src = this.getAttribute('src')
        + "?" + new URLSearchParams({ foo: 'value', bar: 2, });
      // Return the fetch chain directly; wrapping it in `new Promise` added nothing.
      return fetch(src)
        .then((response) => response.json())
        .then((json) => {
          // Previously assigned from `data`, which is not defined in this
          // callback: that threw a ReferenceError, so nothing was rendered.
          this.data = json;
          this.renderPosts(json);
        });
    }

    /**
     * Show "<src> : <count>" and dispatch the "load" event carrying the payload.
     * @param {object} data parsed JSON payload; `data.count` is displayed.
     */
    renderPosts(data) {
      this.innerHTML = `<span>${this.getAttribute('src')} : ${data.count}</span>`;
      this.dispatchEvent(new CustomEvent("load", {
        detail: data,
        composed: true,
        bubbles: true
      }));
    }
  }
)

View file

@ -18,7 +18,6 @@
<link rel="stylesheet" href="animations.css" />
</head>
<body>
</body>

View file

@ -73,3 +73,21 @@ html {
router-outlet > * {
display: block;
}
/* Elements with class card-header: cap the width at 300px. */
.card-header {
max-width: 300px;
}
/* Header slot content: flex row, items centred on the cross axis and spread to both ends. */
.card-header [slot='header'] {
display: flex;
align-items: center;
justify-content: space-between;
}
/* Remove the default margin of h3 headings inside card-header. */
.card-header h3 {
margin: 0;
}
/* Icon buttons inside card-header use the medium font-size token. */
.card-header sl-icon-button {
font-size: var(--sl-font-size-medium);
}
}