1
0
Fork 0

[mod] memory usage

This commit is contained in:
Andy Bunce 2025-02-18 16:20:02 +00:00
parent d8acb448de
commit 919514f783
9 changed files with 67 additions and 26 deletions

View file

@ -1,5 +1,5 @@
{
"basexTools.xquery.profile": "basex-10",
"basexTools.xquery.showHovers": true,
"basexTools.xquery.showHovers": false,
}

View file

@ -1,3 +1,8 @@
## 0.2.7 2025-02-18
* reduce memory use
* add open from xs:base64Binary
* open opts with password
* increase test coverage
## 0.2.5 2025-02-17
* rename property pages to pageCount
* increase test coverage

View file

@ -1,6 +1,6 @@
{
"name": "pdfbox",
"version": "0.2.5",
"version": "0.2.7",
"description": "A BaseX interface to Apache Pdfbox version 3",
"main": "src/Pdfbox3.xqm",
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",

View file

@ -20,11 +20,13 @@ The features focus on extracting information from PDFs rather than creation or e
* save pdf page range to a new pdf.
* save image of rendered pdf page.
* open PDF with password
* support for xs:base64Binary in function inputs and outputs to support database and store usage.
* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage.
### Not supported:
* creating completely new PDFs
* Page size information
* XMP processing
* Form processing
## Documentation
* Function [documentation](doc.md)

View file

@ -1,4 +1,4 @@
(: WIP :)
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base ");

View file

@ -1,7 +1,6 @@
(: build xar:)
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
let $xar:=build:xar-create()
let $output-file := build:xar-path()
return (build:write-binary($output-file, $xar),

View file

@ -15,7 +15,7 @@ as xs:string {
string-join(
("https://repo.maven.apache.org/maven2/",
string-join($dep/*/string(), "/"),
(: groupId dots become path separators; translate() is literal, whereas
   replace() treats '.' as a regex matching every character :)
translate($dep/groupId, ".", "/"),
"/",$dep/artifactId, "-", $dep/version, ".",$ext
))
};

View file

@ -1,7 +1,7 @@
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
let $output-file := file:resolve-path("dist/pdfbox-" || $build:PKG?version ||".xar",$build:base)
let $output-file := build:xar-path()
return (
repo:install($output-file),
trace($output-file,"repo: ")

View file

@ -16,12 +16,16 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
declare namespace PDPage ="java:org.apache.pdfbox.pdmodel.PDPage";
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace PDRectangle="java:org.apache.pdfbox.pdmodel.common.PDRectangle";
declare namespace File ="java:java.io.File";
@ -44,20 +48,30 @@ as item()*{
(:~ open pdf from file path, http url or xs:base64Binary using default options, returns pdf object :)
declare function pdfbox:open($pdfpath as xs:string)
declare function pdfbox:open($pdfsrc as item())
as item(){
pdfbox:open($pdfpath, map{})
pdfbox:open($pdfsrc, map{})
};
(:~ open pdf using with password option, returns pdf object :)
declare function pdfbox:open($pdfpath as xs:string, $opts as map(*))
(:~ open pdf from file/url/binary, opts may have password, returns pdf object
@param $pdfsrc a file path, a fetchable http(s) url or an xs:base64Binary
@param $opts options map, e.g. map{"password": "secret"}
:)
declare function pdfbox:open($pdfsrc as item(), $opts as map(*))
as item(){
try{
if($opts?password)
then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password)
else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)))
if($pdfsrc instance of xs:base64Binary)
then Loader:loadPDF( $pdfsrc,string($opts?password))
else if(starts-with($pdfsrc,"http"))
then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password))
else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password))
} catch *{
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description)
let $loc:=if($pdfsrc instance of xs:base64Binary)
then "xs:base64Binary"
else $pdfsrc
return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
}
};
@ -99,10 +113,10 @@ as xs:integer{
};
(:~ render of $pdf page to image
options.format="gif,"png" etc, options.scale= 1 is 72 dpi?? :)
options.format e.g. "bmp", "jpg", "png", "gif" (default "jpg"), options.scale default 1 (1 is 72 dpi??) :)
declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*))
as xs:base64Binary{
let $options:=map:merge(($options,map{"format":"gif","scale":1}))
let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes)
@ -175,18 +189,31 @@ as map(*){
pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
};
(:~ summary CSV style info for named properties for $pdfpaths :)
declare function pdfbox:report($pdfpaths as xs:string*, $properties as xs:string*)
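(:~ summary CSV style info for named properties for $pdfpaths
@param $pdfpaths pdf sources, each a path/url string or an xs:base64Binary (reported as "binary")
@param $properties names of the properties to report, one column each
@return map with "names" (header array) and "records" (one array per source);
        a source that fails to load or report gets "#ERROR" in every property column
@see https://docs.basex.org/main/CSV_Functions#xquery
:)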
declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
as map(*){
map{"names": array{"path",$properties},
"records": for $path in $pdfpaths
let $pdf:=pdfbox:open($path)
return fold-left($properties,
array{$path},
let $name:=if($path instance of xs:base64Binary) then "binary" else $path
return try{
let $pdf:=pdfbox:open($path)
return (fold-left($properties,
array{$name},
function($result as array(*),$prop as xs:string){
array:append($result, string(pdfbox:property($pdf, $prop)))}
)
), pdfbox:close($pdf)
)
} catch *{
fold-left($properties,
array{$name},
function($result as array(*),$prop as xs:string){
array:append($result, "#ERROR")}
)
}
}
};
@ -318,14 +345,22 @@ as xs:string*
};
(:~ return text on $pageNo :)
declare function pdfbox:page-text($doc as item(), $pageNo as xs:integer)
declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
as xs:string{
let $tStripper := (# db:wrapjava instance #) {
PDFTextStripper:new()
=> PDFTextStripper:setStartPage($pageNo)
=> PDFTextStripper:setEndPage($pageNo)
}
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
};
(:~ media box of page $pageNo (zero based) of $pdf, rendered as a string :)
declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer)
as xs:string{
  let $page := PDDocument:getPage($pdf, $pageNo)
  let $mediaBox := PDPage:getMediaBox($page)
  return PDRectangle:toString($mediaBox)
};
(:~ version of Apache Pdfbox in use e.g. "3.0.4" :)