[fix] #1
This commit is contained in:
parent
644de0df32
commit
4ea01764f9
7 changed files with 49 additions and 55 deletions
|
@ -1 +0,0 @@
|
|||
aabcdeggxcd
|
|
@ -6,4 +6,5 @@ Xyxh
|
|||
9.7.4 xyxz01x
|
||||
ab
|
||||
iiiisAasxs
|
||||
x
|
||||
x
|
||||
A
|
|
@ -1,3 +1,6 @@
|
|||
## 0.1.6 2025-02-14
|
||||
* Add `hasLabels`
|
||||
* FIX #1 error if no labels
|
||||
## 0.1.5 2025-02-10
|
||||
* Add `isEncrypted`
|
||||
* Rename `open` to `open-file`
|
5
doc.md
5
doc.md
|
@ -225,7 +225,6 @@ Returns the version of the Apache PDFBox library in use.
|
|||
|
||||
## Notes
|
||||
|
||||
- Ensure that the `pdfbox-app-3.0.4.jar` (or a compatible version) is on the classpath.
|
||||
- The library is designed to work with BaseX 10.7+.
|
||||
- Some functions may throw errors if the PDF is encrypted or if the file cannot be opened.
|
||||
|
||||
|
@ -258,7 +257,3 @@ return pdfbox:metadata($pdf)
|
|||
let $pdf := pdfbox:open-file("path/to/document.pdf")
|
||||
return pdfbox:extract($pdf, 1, 3, "path/to/new/document.pdf")
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
The `Pdfbox3.xqm` library provides a powerful interface for working with PDF documents in XQuery. It allows you to extract text, render pages, extract metadata, and more.
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "pdfbox",
|
||||
"version": "0.1.5",
|
||||
"version": "0.1.6",
|
||||
"description": "A BaseX interface to Apache Pdfbox version 3",
|
||||
"main": "src/Pdfbox3.xqm",
|
||||
"homepage": "https://github.com/npm/example#readme",
|
||||
|
|
|
@ -3,13 +3,12 @@ A `BaseX` interface for the `Apache Pdfbox library` version 3.
|
|||
|
||||
The [Apache PDFBox® library](https://pdfbox.apache.org/) is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents.
|
||||
|
||||
This interface is packaged in the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format.
|
||||
This interface is packaged in the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format. The package includes the required Pdfbox jars.
|
||||
A test suite is available and workflow actions run this on BaseX 10.7 and 11.7.
|
||||
|
||||
> [!NOTE]
|
||||
>Currently (v0.1.5) works with BaseX 9.7, but this may change with future versions.
|
||||
|
||||
* The Apache Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
|
||||
## Features
|
||||
|
||||
|
||||
|
|
|
@ -92,28 +92,23 @@ as xs:base64Binary{
|
|||
};
|
||||
|
||||
declare variable $pdfbox:doc-info:=map{
|
||||
"title": PDDocumentInformation:getTitle#1,
|
||||
"creator": PDDocumentInformation:getCreator#1,
|
||||
"producer": PDDocumentInformation:getProducer#1,
|
||||
"subject": PDDocumentInformation:getSubject#1,
|
||||
"keywords": PDDocumentInformation:getKeywords#1,
|
||||
"creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate#1),
|
||||
"author": PDDocumentInformation:getAuthor#1
|
||||
"title": PDDocumentInformation:getTitle#1,
|
||||
"author": PDDocumentInformation:getAuthor#1,
|
||||
"creator": PDDocumentInformation:getCreator#1,
|
||||
"producer": PDDocumentInformation:getProducer#1,
|
||||
"subject": PDDocumentInformation:getSubject#1,
|
||||
"keywords": PDDocumentInformation:getKeywords#1,
|
||||
"creationdate": function($i){pdfbox:gregToISO(PDDocumentInformation:getCreationDate($i))},
|
||||
"modificationdate": function($i){pdfbox:gregToISO(PDDocumentInformation:getModificationDate($i))}
|
||||
};
|
||||
|
||||
(:~ map with document metadata :)
|
||||
declare function pdfbox:metadata($pdf as item())
|
||||
as map(*){
|
||||
let $info:=PDDocument:getDocumentInformation($pdf)
|
||||
return map{
|
||||
"title": PDDocumentInformation:getTitle($info),
|
||||
"creator": PDDocumentInformation:getCreator($info),
|
||||
"producer": PDDocumentInformation:getProducer($info),
|
||||
"subject": PDDocumentInformation:getSubject($info),
|
||||
"keywords": PDDocumentInformation:getKeywords($info),
|
||||
"creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
|
||||
"author": PDDocumentInformation:getAuthor($info)
|
||||
}
|
||||
return map:for-each($pdfbox:doc-info,
|
||||
function($k,$v){map:entry($k,$pdfbox:doc-info($k)($info))})
|
||||
=>map:merge()
|
||||
};
|
||||
|
||||
(:~ summary info as map for $pdfpath :)
|
||||
|
@ -124,27 +119,26 @@ as map(*){
|
|||
"file": $pdfpath,
|
||||
"pages": pdfbox:page-count($pdf),
|
||||
"hasOutline": pdfbox:hasOutline($pdf),
|
||||
"hasLabels": pdfbox:hasLabels($pdf),
|
||||
"specification":pdfbox:specification($pdf)
|
||||
},pdfbox:metadata($pdf)
|
||||
)=>map:merge()
|
||||
};
|
||||
|
||||
(:~ true if $pdf has an outline for $pdf as map()* :)
|
||||
(:~ true if $pdf has an outline :)
|
||||
declare function pdfbox:hasOutline($pdf as item())
|
||||
as xs:boolean{
|
||||
(# db:wrapjava some #) {
|
||||
let $outline:=
|
||||
PDDocument:getDocumentCatalog($pdf)
|
||||
=>PDDocumentCatalog:getDocumentOutline()
|
||||
|
||||
return exists($outline)
|
||||
}
|
||||
PDDocument:getDocumentCatalog($pdf)
|
||||
=>PDDocumentCatalog:getDocumentOutline()
|
||||
=>exists()
|
||||
};
|
||||
|
||||
(:~ true if $pdf is encrypted* :)
|
||||
declare function pdfbox:isEncrypted($pdf as item())
|
||||
(:~ true if $pdf has Labels :)
|
||||
declare function pdfbox:hasLabels($pdf as item())
|
||||
as xs:boolean{
|
||||
PDDocument:isEncrypted($pdf)
|
||||
PDDocument:getDocumentCatalog($pdf)
|
||||
=>PDDocumentCatalog:getPageLabels()
|
||||
=>exists()
|
||||
};
|
||||
|
||||
(:~ outline for $pdf as map()* :)
|
||||
|
@ -162,7 +156,6 @@ as map(*)*{
|
|||
|
||||
(:~ return bookmark info for children of $outlineItem as seq of maps :)
|
||||
declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
|
||||
|
||||
as map(*)*{
|
||||
let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem)
|
||||
return map:get($find,"list")
|
||||
|
@ -176,15 +169,15 @@ as map(*){
|
|||
map{"list":(),"this":$outlineItem},
|
||||
|
||||
function($input,$pos ) {
|
||||
let $bk:= pdfbox:bookmark($input?this,$pdf)
|
||||
let $bk:= if($bk?hasChildren)
|
||||
then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
|
||||
return map:merge(($bk,map:entry("children",$kids)))
|
||||
else $bk
|
||||
return map{
|
||||
"list": ($input?list, $bk),
|
||||
"this": PDOutlineItem:getNextSibling($input?this)}
|
||||
},
|
||||
let $bk:= pdfbox:bookmark($input?this,$pdf)
|
||||
let $bk:= if($bk?hasChildren)
|
||||
then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
|
||||
return map:merge(($bk,map:entry("children",$kids)))
|
||||
else $bk
|
||||
return map{
|
||||
"list": ($input?list, $bk),
|
||||
"this": PDOutlineItem:getNextSibling($input?this)}
|
||||
},
|
||||
|
||||
function($output,$pos) { empty($output?this) }
|
||||
)
|
||||
|
@ -248,16 +241,18 @@ as xs:string
|
|||
};
|
||||
|
||||
|
||||
(:~ pageLabel for every page
|
||||
(:~ pageLabel for every page or empty if none
|
||||
@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
|
||||
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
|
||||
:)
|
||||
declare function pdfbox:labels($pdf as item())
|
||||
as xs:string*
|
||||
{
|
||||
PDDocument:getDocumentCatalog($pdf)
|
||||
=>PDDocumentCatalog:getPageLabels()
|
||||
=>PDPageLabels:getLabelsByPageIndices()
|
||||
let $pagelabels:=PDDocument:getDocumentCatalog($pdf)
|
||||
=>PDDocumentCatalog:getPageLabels()
|
||||
return if(exists($pagelabels))
|
||||
then PDPageLabels:getLabelsByPageIndices($pagelabels)
|
||||
else ()
|
||||
};
|
||||
|
||||
(:~ return text on $pageNo :)
|
||||
|
@ -279,9 +274,11 @@ as xs:string{
|
|||
|
||||
(:~ convert date :)
|
||||
declare %private
|
||||
function pdfbox:gregToISO($item as item())
|
||||
as xs:string{
|
||||
Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
|
||||
function pdfbox:gregToISO($item as item()?)
|
||||
as xs:string?{
|
||||
if(exists($item))
|
||||
then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
|
||||
else ()
|
||||
};
|
||||
|
||||
(:~ fn:do-until shim for BaseX 9+10
|
||||
|
@ -299,6 +296,6 @@ declare %private function pdfbox:do-until(
|
|||
else let $hof:=function-lookup(QName('http://basex.org/modules/hof','until'), 3)
|
||||
return if(exists($hof))
|
||||
then $hof($predicate(?,0),$action(?,0),$input)
|
||||
else error(xs:QName('pdfbox:do-until'),"No implementation found")
|
||||
else error(xs:QName('pdfbox:do-until'),"No implementation do-until found")
|
||||
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue