[mod] memory usage
This commit is contained in:
parent
d8acb448de
commit
919514f783
9 changed files with 67 additions and 26 deletions
2
.vscode/settings.json
vendored
2
.vscode/settings.json
vendored
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"basexTools.xquery.profile": "basex-10",
|
||||
"basexTools.xquery.showHovers": true,
|
||||
"basexTools.xquery.showHovers": false,
|
||||
|
||||
}
|
|
@ -1,3 +1,8 @@
|
|||
## 0.2.7 2025-02-18
|
||||
* reduce memory use
|
||||
* add open from xs:base64Binary
|
||||
* open: options map may supply a password
|
||||
* increase test coverage
|
||||
## 0.2.5 2025-02-17
|
||||
* rename property pages to pageCount
|
||||
* increase test coverage
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "pdfbox",
|
||||
"version": "0.2.5",
|
||||
"version": "0.2.7",
|
||||
"description": "A BaseX interface to Apache Pdfbox version 3",
|
||||
"main": "src/Pdfbox3.xqm",
|
||||
"homepage": "https://github.com/expkg-zone58/pdfbox#readme",
|
||||
|
|
|
@ -20,11 +20,13 @@ The features focus on extracting information from PDFs rather than creation or e
|
|||
* save pdf page range to a new pdf.
|
||||
* save image of rendered pdf page.
|
||||
* open PDF with password
|
||||
* support for xs:base64Binary in function inputs and outputs to support database and store usage.
|
||||
* support for xs:base64Binary in function inputs and outputs to facilitate database and store usage.
|
||||
|
||||
### Not supported:
|
||||
* creating completely new PDFs
|
||||
* Page size information
|
||||
* XMP processing
|
||||
* Form processing
|
||||
|
||||
## Documentation
|
||||
* Function [documentation](doc.md)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
|
||||
(: WIP :)
|
||||
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
|
||||
|
||||
declare variable $base:= file:resolve-path("../",static-base-uri())=>trace("base ");
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
|
||||
(: build xar:)
|
||||
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
|
||||
|
||||
|
||||
let $xar:=build:xar-create()
|
||||
let $output-file := build:xar-path()
|
||||
return (build:write-binary($output-file, $xar),
|
||||
|
|
|
@ -15,7 +15,7 @@ as xs:string {
|
|||
|
||||
string-join(
|
||||
("https://repo.maven.apache.org/maven2/",
|
||||
string-join($dep/*/string(), "/"),
|
||||
replace($dep/groupId,'.',"/"),
|
||||
"/",$dep/artifactId, "-", $dep/version, ".",$ext
|
||||
))
|
||||
};
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
|
||||
|
||||
let $output-file := file:resolve-path("dist/pdfbox-" || $build:PKG?version ||".xar",$build:base)
|
||||
let $output-file := build:xar-path()
|
||||
return (
|
||||
repo:install($output-file),
|
||||
trace($output-file,"repo: ")
|
||||
|
|
|
@ -16,12 +16,16 @@ declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
|
|||
declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
|
||||
declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
|
||||
declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
|
||||
declare namespace PDPage ="org.apache.pdfbox.pdmodel.PDPage";
|
||||
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
|
||||
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
|
||||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
||||
declare namespace RandomAccessReadBuffer="java:org.apache.pdfbox.io.RandomAccessReadBuffer";
|
||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
||||
declare namespace PDRectangle="org.apache.pdfbox.pdmodel.common.PDRectangle";
|
||||
|
||||
declare namespace File ="java:java.io.File";
|
||||
|
||||
|
||||
|
@ -44,20 +48,30 @@ as item()*{
|
|||
|
||||
|
||||
(:~ open pdf from file/url/binary with no options, returns pdf object :)
|
||||
declare function pdfbox:open($pdfpath as xs:string)
|
||||
declare function pdfbox:open($pdfsrc as item())
|
||||
as item(){
|
||||
pdfbox:open($pdfpath, map{})
|
||||
pdfbox:open($pdfsrc, map{})
|
||||
};
|
||||
|
||||
(:~ open pdf using with password option, returns pdf object :)
|
||||
declare function pdfbox:open($pdfpath as xs:string, $opts as map(*))
|
||||
(:~ open pdf from file/url/binary, opts may have password, returns pdf object
|
||||
@param $pdfsrc a file path, a fetchable http(s) url, or an xs:base64Binary
|
||||
@param $opts options map, e.g. map{"password": "secret"}; the password entry is optional
|
||||
:)
|
||||
declare function pdfbox:open($pdfsrc as item(), $opts as map(*))
|
||||
as item(){
|
||||
try{
|
||||
if($opts?password)
|
||||
then Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)),$opts?password)
|
||||
else Loader:loadPDF( RandomAccessReadBuffer:new(fetch:binary($pdfpath)))
|
||||
|
||||
if($pdfsrc instance of xs:base64Binary)
|
||||
then Loader:loadPDF( $pdfsrc,string($opts?password))
|
||||
else if(starts-with($pdfsrc,"http"))
|
||||
then Loader:loadPDF( fetch:binary($pdfsrc),string($opts?password))
|
||||
else Loader:loadPDF(RandomAccessReadBufferedFile:new($pdfsrc),string($opts?password))
|
||||
|
||||
} catch *{
|
||||
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath || " " || $err:description)
|
||||
let $loc:=if($pdfsrc instance of xs:base64Binary)
|
||||
then "xs:base64Binary"
|
||||
else $pdfsrc
|
||||
return error(xs:QName("pdfbox:open"),"Failed PDF load " || $loc || " " || $err:description)
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -99,10 +113,10 @@ as xs:integer{
|
|||
};
|
||||
|
||||
(:~ render of $pdf page to image
|
||||
options.format="gif,"png" etc, options.scale= 1 is 72 dpi?? :)
|
||||
options.format is one of "bmp", "jpg", "png", "gif" etc (default "jpg"), options.scale defaults to 1; scale= 1 is 72 dpi?? :)
|
||||
declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*))
|
||||
as xs:base64Binary{
|
||||
let $options:=map:merge(($options,map{"format":"gif","scale":1}))
|
||||
let $options:=map:merge(($options,map{"format":"jpg","scale":1}))
|
||||
let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
|
||||
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
||||
let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes)
|
||||
|
@ -175,18 +189,31 @@ as map(*){
|
|||
pdfbox:report($pdfpaths,map:keys($pdfbox:property-map))
|
||||
};
|
||||
|
||||
(:~ summary CSV style info for named properties for $pdfpaths :)
|
||||
declare function pdfbox:report($pdfpaths as xs:string*, $properties as xs:string*)
|
||||
(:~ summary CSV style info for named properties for $pdfpaths
|
||||
@see https://docs.basex.org/main/CSV_Functions#xquery
|
||||
:)
|
||||
declare function pdfbox:report($pdfpaths as item()*, $properties as xs:string*)
|
||||
as map(*){
|
||||
map{"names": array{"path",$properties},
|
||||
|
||||
"records": for $path in $pdfpaths
|
||||
let $pdf:=pdfbox:open($path)
|
||||
return fold-left($properties,
|
||||
array{$path},
|
||||
let $name:=if($path instance of xs:base64Binary) then "binary" else $path
|
||||
return try{
|
||||
let $pdf:=pdfbox:open($path)
|
||||
return (fold-left($properties,
|
||||
array{$name},
|
||||
function($result as array(*),$prop as xs:string){
|
||||
array:append($result, string(pdfbox:property($pdf, $prop)))}
|
||||
)
|
||||
), pdfbox:close($pdf)
|
||||
)
|
||||
} catch *{
|
||||
fold-left($properties,
|
||||
array{$name},
|
||||
function($result as array(*),$prop as xs:string){
|
||||
array:append($result, "#ERROR")}
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -318,14 +345,22 @@ as xs:string*
|
|||
};
|
||||
|
||||
(:~ return text on $pageNo :)
|
||||
declare function pdfbox:page-text($doc as item(), $pageNo as xs:integer)
|
||||
declare function pdfbox:page-text($pdf as item(), $pageNo as xs:integer)
|
||||
as xs:string{
|
||||
let $tStripper := (# db:wrapjava instance #) {
|
||||
PDFTextStripper:new()
|
||||
=> PDFTextStripper:setStartPage($pageNo)
|
||||
=> PDFTextStripper:setEndPage($pageNo)
|
||||
}
|
||||
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
|
||||
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$pdf)}
|
||||
};
|
||||
|
||||
(:~ media box of a single page, rendered as a string
 @param $pdf an open pdf object
 @param $pageNo page index, zero based
 @return result of PDRectangle.toString() for the page's media box
:)
declare function pdfbox:page-size($pdf as item(), $pageNo as xs:integer)
as xs:string{
  let $page := PDDocument:getPage($pdf, $pageNo)
  let $mediaBox := PDPage:getMediaBox($page)
  return PDRectangle:toString($mediaBox)
};
|
||||
|
||||
(:~ version of Apache Pdfbox in use e.g. "3.0.4" :)
|
||||
|
|
Loading…
Add table
Reference in a new issue