[mod] download

This commit is contained in:
Andy Bunce 2025-01-30 16:57:25 +00:00
parent 5bc6f02802
commit a0cfa6d937
5 changed files with 110 additions and 80 deletions

View file

@ -49,10 +49,12 @@ as item(){
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
};
(:~ the version of the PDF specification used by $pdf :)
declare function pdfbox:pdfVersion($pdf as item())
as xs:float{
PDDocument:getVersion($pdf)
(:~ the version of the PDF specification used by $pdf, e.g. "1.4";
 : returned as a string to avoid rounding issues
 : @param $pdf the PDF document object
 : @return the value of PDDocument:getVersion($pdf) converted with string()
 :)
declare function pdfbox:specification($pdf as item())
as xs:string{
  string(PDDocument:getVersion($pdf))
};
(:~ save pdf $pdf to $savepath , returns $savepath :)
@ -97,45 +99,49 @@ as xs:string{
Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
};
(:~ outline for $doc as map()* :)
declare function pdfbox:outline($doc as item())
(:~ outline for $pdf as map()* :)
declare function pdfbox:outline($pdf as item())
as map(*)*{
(# db:wrapjava some #) {
let $outline:=
PDDocument:getDocumentCatalog($doc)
PDDocument:getDocumentCatalog($pdf)
=>PDDocumentCatalog:getDocumentOutline()
return if(exists($outline))
then pdfbox:outline($doc,PDOutlineItem:getFirstChild($outline))
then pdfbox:outline($pdf,PDOutlineItem:getFirstChild($outline))
}
};
(:~ return bookmark info for children of $outlineItem as seq of maps :)
declare function pdfbox:outline($doc as item(),$outlineItem as item()?)
declare function pdfbox:outline($pdf as item(),$outlineItem as item()?)
as map(*)*{
let $find as map(*):=pdfbox:_outline($doc ,$outlineItem)
let $find as map(*):=pdfbox:_outline($pdf ,$outlineItem)
return map:get($find,"list")
};
(: BaseX bug 10.7? error if inlined in outline :)
declare %private function pdfbox:_outline($doc as item(),$outlineItem as item()?)
declare %private function pdfbox:_outline($pdf as item(),$outlineItem as item()?)
as map(*){
hof:until(
function($output) { empty($output?this) },
function($input ) {
let $bk:= pdfbox:bookmark($input?this,$doc)
pdfbox:do-until(
map{"list":(),"this":$outlineItem},
function($input ) {
let $bk:= pdfbox:bookmark($input?this,$pdf)
let $bk:= if($bk?hasChildren)
then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
then let $kids:=pdfbox:outline($pdf,PDOutlineItem:getFirstChild($input?this))
return map:merge(($bk,map:entry("children",$kids)))
else $bk
return map{
"list": ($input?list, $bk),
"this": PDOutlineItem:getNextSibling($input?this)}
},
map{"list":(),"this":$outlineItem}
)
function($output) { empty($output?this) }
)
};
(:~ outline as xml :)
declare function pdfbox:outline-xml($outline as map(*)*)
as element(outline){
@ -156,27 +162,22 @@ as element(bookmark)*
(:~ return bookmark info for children of $outlineItem
@return map like{index:,title:,hasChildren:}
:)
declare function pdfbox:bookmark($bookmark as item(),$doc as item())
declare function pdfbox:bookmark($bookmark as item(),$pdf as item())
as map(*)
{
map{
"index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc),
"index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("<22>",""),
"hasChildren": PDOutlineItem:hasChildren($bookmark)
}
};
declare function pdfbox:outx($page ,$document)
{
let $currentPage := PDOutlineItem:findDestinationPage($page,$document)
let $pageNumber := pdfbox:pageIndex($currentPage,$document)
return $pageNumber
};
(:~ pageIndex of $page in $pdf :)
declare function pdfbox:pageIndex(
declare function pdfbox:page-index(
$page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
$pdf)
$pdf as item())
as item()?
{
if(exists($page))
@ -198,28 +199,20 @@ as xs:string
};
(:~ pageLabel info
(:~ pageLabel for every page
@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
:)
declare function pdfbox:getPageLabels($pdf as item())
as item()
{
PDDocument:getDocumentCatalog($pdf)
=>PDDocumentCatalog:getPageLabels()
};
(:~ pageLabel for every page:)
declare function pdfbox:pageLabels($doc as item())
declare function pdfbox:labels($pdf as item())
as xs:string*
{
PDDocument:getDocumentCatalog($doc)
PDDocument:getDocumentCatalog($pdf)
=>PDDocumentCatalog:getPageLabels()
=>PDPageLabels:getLabelsByPageIndices()
};
(:~ return text on $pageNo :)
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
declare function pdfbox:page-text($doc as item(), $pageNo as xs:integer)
as xs:string{
let $tStripper := (# db:wrapjava instance #) {
PDFTextStripper:new()
@ -246,9 +239,9 @@ as map(*){
@param $scale 1=72 dpi
@return Java java.awt.image.BufferedImage object
:)
declare function pdfbox:pageBufferedImage($doc as item(), $pageNo as xs:integer,$scale as xs:float)
declare function pdfbox:pageBufferedImage($pdf as item(), $pageNo as xs:integer,$scale as xs:float)
as item(){
PDFRenderer:new($doc)=>PDFRenderer:renderImage($pageNo,$scale)
PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$scale)
};
(:~ save bufferedimage to $dest
@ -266,4 +259,21 @@ as xs:base64Binary{
let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, $bytes)
return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
=>convert:integers-to-base64()
};
};
(:~ fn:do-until shim for BaseX 9+
 : Delegates to fn:do-until when the processor provides it, otherwise to
 : hof:until from the BaseX hof module (with the argument order that function
 : expects: predicate, action, input).
 :
 : The callbacks are typed as function(*) rather than with a fixed arity:
 : callers in this module pass one-argument inline functions, and the hof:until
 : fallback invokes them with a single argument, so an arity-2 signature would
 : be rejected by function coercion on processors without XQuery 4 support.
 :
 : NOTE(review): hof:until is documented as testing the predicate on the start
 : value before the first application of $action, whereas fn:do-until applies
 : $action first - confirm callers tolerate both orders.
 :
 : @param $input initial value handed to the iteration
 : @param $action function producing the next value from the current one
 : @param $predicate function deciding when the iteration stops
 : @return whatever the underlying implementation returns
 : @error pdfbox:do-until if neither fn:do-until nor hof:until can be found
 :)
declare function pdfbox:do-until(
  $input as item()*,
  $action as function(*),
  $predicate as function(*)
) as item()*
{
  let $fn := function-lookup(QName('http://www.w3.org/2005/xpath-functions','do-until'), 3)
  return
    (: exists() instead of a bare if($fn): a function item has no effective boolean value :)
    if(exists($fn))
    then $fn($input,$action,$predicate)
    else
      let $hof := function-lookup(QName('http://basex.org/modules/hof','until'), 3)
      return
        if(exists($hof))
        then $hof($predicate,$action,$input)
        else error(xs:QName('pdfbox:do-until'),"No implementation found")
};

View file

@ -7,39 +7,52 @@ import module namespace pdfbox="org.expkg_zone58.Pdfbox3";
declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
(:~ pdfbox:version() is expected to return "3.0.4"; the value is also traced with label "VER: " :)
declare %unit:test
function test:pdfbox-version(){
  unit:assert-equals(
    trace(pdfbox:version(), "VER: "),
    "3.0.4"
  )
};
(:~ pdfbox:specification returns the PDF specification version as xs:string,
 : so the expected value must be the string "1.4": unit:assert-equals would
 : never treat a string as equal to the decimal 1.4.
 :)
declare %unit:test
function test:specification(){
  let $pdf := test:pdf("samples.pdf/BaseX100.pdf")
  let $spec := pdfbox:specification($pdf)
  return unit:assert-equals($spec, "1.4")
};
declare %unit:test
function test:page-count(){
let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
let $pages:=pdfbox:open($PDF)=>pdfbox:page-count()
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
let $pages:=pdfbox:page-count($pdf)
return unit:assert-equals($pages,521)
};
declare %unit:test
function test:outline-none(){
let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
let $outline:=pdfbox:open($PDF)=>pdfbox:outline()
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
let $outline:=pdfbox:outline($pdf)
return unit:assert(empty($outline))
};
declare %unit:test
function test:outline-present(){
let $PDF:="samples.pdf/icelandic-dictionary.pdf"=>test:resolve()
let $outline:=pdfbox:open($PDF)=>pdfbox:outline()
let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
let $outline:=pdfbox:outline($pdf)
return unit:assert(exists($outline))
};
declare %unit:test
function test:outline-xml(){
let $PDF:="samples.pdf/icelandic-dictionary.pdf"=>test:resolve()
let $outline:=pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()
let $pdf:=test:pdf("samples.pdf/icelandic-dictionary.pdf")
let $outline:=pdfbox:outline($pdf)=>pdfbox:outline-xml()
return unit:assert-equals(count($outline/bookmark),31)
};
declare %unit:test
function test:pagelabels(){
let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
let $labels:=pdfbox:open($PDF)=>pdfbox:pageLabels()
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
let $labels:=pdfbox:labels($pdf)
return (
unit:assert($labels[1]="i") ,
unit:assert($labels[27]="1")
@ -47,20 +60,21 @@ function test:pagelabels(){
};
declare %unit:test
function test:save(){
function test:extract-save(){
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
let $dest:=file:create-temp-file("test",".pdf")=>trace("DEST: ")
let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
let $outline:=pdfbox:open($PDF)=>pdfbox:extract(2,12,$dest)
let $outline:=pdfbox:extract($pdf,2,12,$dest)
return unit:assert(true())
};
declare %unit:test
function test:page-text(){
let $PDF:="samples.pdf/BaseX100.pdf"=>test:resolve()
let $text:=pdfbox:open($PDF)=>pdfbox:getText(1)
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
let $text:=pdfbox:page-text($pdf,1)
return unit:assert(starts-with($text,"BaseX Documentation"))
};
declare function test:resolve($file as xs:string){
file:resolve-path($file,$test:base)
(:~ test helper: resolve $file against $test:base and open it with pdfbox:open
 : @param $file path of the PDF, relative to $test:base
 : @return the item returned by pdfbox:open
 :)
declare function test:pdf($file as xs:string)
as item(){
  let $path := file:resolve-path($file, $test:base)
  return pdfbox:open($path)
};