From 919514f783606ad37b017b2270123699e358216b Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Tue, 18 Feb 2025 16:20:02 +0000 Subject: [PATCH] [mod] memory usage --- .vscode/settings.json | 2 +- changelog.md | 5 +++ package.json | 2 +- readme.md | 4 ++- scripts/make-fat-jar.xq | 2 +- scripts/make-xar.xq | 3 +- scripts/maven.xqm | 2 +- scripts/repo-install.xq | 2 +- src/Pdfbox3.xqm | 71 ++++++++++++++++++++++++++++++----------- 9 files changed, 67 insertions(+), 26 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index bfb690a..13498c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,5 +1,5 @@ { "basexTools.xquery.profile": "basex-10", - "basexTools.xquery.showHovers": true, + "basexTools.xquery.showHovers": false, } \ No newline at end of file diff --git a/changelog.md b/changelog.md index 8c3e87d..2df7f79 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,8 @@ +## 0.2.7 2025-02-18 +* reduce memory use +* add open from xs:base64Binary +* open opts with password +* increase test coverage ## 0.2.5 2025-02-17 * rename property pages to pageCount * increase test coverage diff --git a/package.json b/package.json index 2147234..58a8beb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdfbox", - "version": "0.2.5", + "version": "0.2.7", "description": "A BaseX interface to Apache Pdfbox version 3", "main": "src/Pdfbox3.xqm", "homepage": "https://github.com/expkg-zone58/pdfbox#readme", diff --git a/readme.md b/readme.md index cacd901..53c084d 100644 --- a/readme.md +++ b/readme.md @@ -20,11 +20,13 @@ The features focus on extracting information from PDFs rather than creation or e * save pdf page range to a new pdf. * save image of rendered pdf page. * open PDF with password -* support for xs:base64Binary in function inputs and outputs to support database and store usage. +* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage. 
### Not supported: * creating completely new PDFs * Page size information +* XMP processing +* Form processing ## Documentation * Function [documentation](doc.md) diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index f023043..b327816 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -1,4 +1,4 @@ - +(: WIP :) import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base "); diff --git a/scripts/make-xar.xq b/scripts/make-xar.xq index 88aef5e..22e7ffc 100644 --- a/scripts/make-xar.xq +++ b/scripts/make-xar.xq @@ -1,7 +1,6 @@ - +(: build xar:) import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; - let $xar:=build:xar-create() let $output-file := build:xar-path() return (build:write-binary($output-file, $xar), diff --git a/scripts/maven.xqm b/scripts/maven.xqm index bf4d10f..e24d2cb 100644 --- a/scripts/maven.xqm +++ b/scripts/maven.xqm @@ -15,7 +15,7 @@ as xs:string { string-join( ("https://repo.maven.apache.org/maven2/", - string-join($dep/*/string(), "/"), + string-join((replace($dep/groupId,'\.',"/"),$dep/artifactId,$dep/version),"/"), "/",$dep/artifactId, "-", $dep/version, ".",$ext )) }; diff --git a/scripts/repo-install.xq b/scripts/repo-install.xq index 88974de..34c3650 100644 --- a/scripts/repo-install.xq +++ b/scripts/repo-install.xq @@ -1,7 +1,7 @@ import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; -let $output-file := file:resolve-path("dist/pdfbox-" || $build:PKG?version ||".xar",$build:base) +let $output-file := build:xar-path() return ( repo:install($output-file), trace($output-file,"repo: ") diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index 0826f19..081eb19 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -16,12 +16,16 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels 
="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; +declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage"; declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer"; +declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; +declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle"; + declare namespace File ="java:java.io.File"; @@ -44,20 +48,30 @@ as item()*{ (:~ open pdf using fetch:binary, returns pdf object :) -declare function pdfbox:open($pdfpath as xs:string) +declare function pdfbox:open($pdfsrc as item()) as item(){ -pdfbox:open($pdfpath, map{}) +pdfbox:open($pdfsrc, map{}) }; -(:~ open pdf using with password option, returns pdf object :) -declare function pdfbox:open($pdfpath as xs:string, $opts as map(*)) +(:~ open pdf from file/url/binary, opts may have password, returns pdf object +@param $pdfsrc a file path, a fetchable http(s) url or a xs:base64Binary +@param $opts map {"password":} +:) +declare function pdfbox:open($pdfsrc as item(), $opts as map(*)) as item(){ try{ - if($opts?password) - then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password) - else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath))) + + if($pdfsrc instance of xs:base64Binary) + then Loader:loadPDF( $pdfsrc,string($opts?password)) + else if(starts-with($pdfsrc,"http")) 
then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password)) + else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password)) + } catch *{ - error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description) + let $loc:=if($pdfsrc instance of xs:base64Binary) + then "xs:base64Binary" + else $pdfsrc + return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description) } }; @@ -99,10 +113,10 @@ as xs:integer{ }; (:~ render of $pdf page to image -options.format="gif,"png" etc, options.scale= 1 is 72 dpi?? :) +options.format="bmp jpg png gif" etc, options.scale= 1 is 72 dpi?? :) declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*)) as xs:base64Binary{ - let $options:=map:merge(($options,map{"format":"gif","scale":1})) + let $options:=map:merge(($options,map{"format":"jpg","scale":1})) let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) @@ -175,18 +189,31 @@ as map(*){ pdfbox:report($pdfpaths,map:keys($pdfbox:property-map)) }; -(:~ summary CSV style info for named properties for $pdfpaths :) -declare function pdfbox:report($pdfpaths as xs:string*, $properties as xs:string*) +(:~ summary CSV style info for named properties for $pdfpaths +@see https://docs.basex.org/main/CSV_Functions#xquery +:) +declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*) as map(*){ map{"names": array{"path",$properties}, "records": for $path in $pdfpaths - let $pdf:=pdfbox:open($path) - return fold-left($properties, - array{$path}, + let $name:=if($path instance of xs:base64Binary) then "binary" else $path + return try{ + let $pdf:=pdfbox:open($path) + return (fold-left($properties, + array{$name}, function($result as array(*),$prop as xs:string){ array:append($result, 
string(pdfbox:property($pdf, $prop)))} - ) + ), pdfbox:close($pdf) + ) + } catch *{ + fold-left($properties, + array{$name}, + function($result as array(*),$prop as xs:string){ + array:append($result, "#ERROR")} + ) + } + } }; @@ -318,14 +345,22 @@ as xs:string* }; (:~ return text on $pageNo :) -declare function pdfbox:page-text($doc as item(), $pageNo as xs:integer) +declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer) as xs:string{ let $tStripper := (# db:wrapjava instance #) { PDFTextStripper:new() => PDFTextStripper:setStartPage($pageNo) => PDFTextStripper:setEndPage($pageNo) } - return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)} + return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)} +}; + +(:~ return the media box of page $pageNo (zero based) as a string :) +declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer) +as xs:string{ + PDDocument:getPage($pdf, $pageNo) + =>PDPage:getMediaBox() + =>PDRectangle:toString() }; (:~ version of Apache Pdfbox in use e.g. "3.0.4" :)