[mod] tidy
This commit is contained in:
parent
a0cfa6d937
commit
87c0a1611e
6 changed files with 40 additions and 208 deletions
|
@ -10,10 +10,6 @@ BaseX (10+) interface to [Pdfbox](https://pdfbox.apache.org/) version 3
|
||||||
* save pdf page as image
|
* save pdf page as image
|
||||||
|
|
||||||
|
|
||||||
## Jars
|
## Build
|
||||||
* fontbox-3.0.2.jar
|
|
||||||
* pdfbox-3.0.2.jar
|
|
||||||
* pdfbox-io-3.0.2.jar
|
|
||||||
* commons-logging-1.3.1.jar
|
|
||||||
|
|
||||||
3.6 mb
|
Use `scripts/make-fat-jar.xq` to package the required `jar` and `xqm` files into the `dist` folder.
|
||||||
|
|
|
@ -7,9 +7,7 @@ declare variable $urls := (
|
||||||
"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
|
"org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar",
|
||||||
"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
|
"commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar"
|
||||||
);
|
);
|
||||||
(: Main execution
|
|
||||||
Main-Class: org.basex.modules.Hello
|
|
||||||
:)
|
|
||||||
let $config :=map {
|
let $config :=map {
|
||||||
"base": file:resolve-path("../",static-base-uri()),
|
"base": file:resolve-path("../",static-base-uri()),
|
||||||
"manifest-jar" : "pdfbox-3.0.4.jar",
|
"manifest-jar" : "pdfbox-3.0.4.jar",
|
||||||
|
|
|
@ -10,28 +10,19 @@ module namespace pdfbox="org.expkg_zone58.Pdfbox3";
|
||||||
|
|
||||||
declare namespace Loader ="java:org.apache.pdfbox.Loader";
|
declare namespace Loader ="java:org.apache.pdfbox.Loader";
|
||||||
declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
|
declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
|
||||||
|
|
||||||
(:~ @javadoc org/apache/pdfbox/pdmodel/PDDocument.html :)
|
|
||||||
declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
|
declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
|
||||||
|
|
||||||
declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
|
declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
|
||||||
declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
|
declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
|
||||||
|
|
||||||
(:~ @javadoc org/apache/pdfbox/multipdf/PageExtractor.html :)
|
|
||||||
declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
|
declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
|
||||||
|
|
||||||
(:~ @javadoc org/apache/pdfbox/pdmodel/PDPageTree.html :)
|
|
||||||
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
|
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
|
||||||
|
|
||||||
(:~
|
|
||||||
@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html
|
|
||||||
:)
|
|
||||||
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
|
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
|
||||||
|
|
||||||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
|
||||||
(:~
|
|
||||||
@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html
|
|
||||||
:)
|
|
||||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
|
||||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
|
||||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
|
||||||
|
@ -46,15 +37,19 @@ as xs:string{
|
||||||
(:~ open pdf, returns pdf object :)
|
(:~ open pdf, returns pdf object :)
|
||||||
declare function pdfbox:open($pdfpath as xs:string)
|
declare function pdfbox:open($pdfpath as xs:string)
|
||||||
as item(){
|
as item(){
|
||||||
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
try{
|
||||||
|
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
||||||
|
} catch *{
|
||||||
|
error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ the version of the PDF specification used by $pdf e.g "1.4"
|
(:~ the version of the PDF specification used by $pdf e.g "1.4"
|
||||||
returned as string to avoid rounding issues
|
returned as string to avoid float rounding issues
|
||||||
:)
|
:)
|
||||||
declare function pdfbox:specification($pdf as item())
|
declare function pdfbox:specification($pdf as item())
|
||||||
as xs:string{
|
as xs:string{
|
||||||
PDDocument:getVersion($pdf)=>string()
|
PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ save pdf $pdf to $savepath , returns $savepath :)
|
(:~ save pdf $pdf to $savepath , returns $savepath :)
|
||||||
|
@ -77,10 +72,23 @@ as xs:integer{
|
||||||
PDDocument:getNumberOfPages($pdf)
|
PDDocument:getNumberOfPages($pdf)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
(:~ render page $pageNo (zero based) of $pdf to an image
|
||||||
|
options.format = "gif", "png" etc; options.scale: 1 = 72 dpi :)
|
||||||
|
declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*))
|
||||||
|
as xs:base64Binary{
|
||||||
|
let $options:=map:merge(($options,map{"format":"gif","scale":1}))
|
||||||
|
let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale)
|
||||||
|
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
||||||
|
let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes)
|
||||||
|
return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
|
||||||
|
=>convert:integers-to-base64()
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
(:~ map with document metadata :)
|
(:~ map with document metadata :)
|
||||||
declare function pdfbox:information($doc as item())
|
declare function pdfbox:information($pdf as item())
|
||||||
as map(*){
|
as map(*){
|
||||||
let $info:=PDDocument:getDocumentInformation($doc)
|
let $info:=PDDocument:getDocumentInformation($pdf)
|
||||||
return map{
|
return map{
|
||||||
"title": PDDocumentInformation:getTitle($info),
|
"title": PDDocumentInformation:getTitle($info),
|
||||||
"creator": PDDocumentInformation:getCreator($info),
|
"creator": PDDocumentInformation:getCreator($info),
|
||||||
|
@ -92,12 +100,7 @@ as map(*){
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ convert date :)
|
|
||||||
declare %private
|
|
||||||
function pdfbox:gregToISO($item as item())
|
|
||||||
as xs:string{
|
|
||||||
Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ outline for $pdf as map()* :)
|
(:~ outline for $pdf as map()* :)
|
||||||
declare function pdfbox:outline($pdf as item())
|
declare function pdfbox:outline($pdf as item())
|
||||||
|
@ -167,13 +170,13 @@ as map(*)
|
||||||
{
|
{
|
||||||
map{
|
map{
|
||||||
"index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
|
"index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf),
|
||||||
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("<22>",""),
|
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}
|
||||||
|
(:=>translate("<22>",""), :),
|
||||||
"hasChildren": PDOutlineItem:hasChildren($bookmark)
|
"hasChildren": PDOutlineItem:hasChildren($bookmark)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
(:~ pageIndex of $page in $pdf :)
|
(:~ pageIndex of $page in $pdf :)
|
||||||
declare function pdfbox:page-index(
|
declare function pdfbox:page-index(
|
||||||
$page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
|
$page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
|
||||||
|
@ -234,35 +237,17 @@ as map(*){
|
||||||
)=>map:merge()
|
)=>map:merge()
|
||||||
};
|
};
|
||||||
|
|
||||||
(:~ java:bufferedImage for $pageNo using $scale times dpi= 72
|
(:~ convert date :)
|
||||||
@param $pageNo (ZERO based)
|
declare %private
|
||||||
@param $scale 1=72 dpi
|
function pdfbox:gregToISO($item as item())
|
||||||
@return Java java.awt.image.BufferedImage object
|
as xs:string{
|
||||||
|
Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
|
||||||
|
};
|
||||||
|
|
||||||
|
(:~ fn:do-until shim for BaseX 9+10
|
||||||
|
if fn:do-until not found use hof:until
|
||||||
:)
|
:)
|
||||||
declare function pdfbox:pageBufferedImage($pdf as item(), $pageNo as xs:integer,$scale as xs:float)
|
declare %private function pdfbox:do-until(
|
||||||
as item(){
|
|
||||||
PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$scale)
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ save bufferedimage to $dest
|
|
||||||
@param $type = "gif","png" etc:)
|
|
||||||
declare function pdfbox:imageSave($bufferedImage as item(),$dest as xs:string,$type as xs:string)
|
|
||||||
as xs:boolean{
|
|
||||||
Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, File:new($dest))
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ return image
|
|
||||||
@param $type = "gif","png" etc:)
|
|
||||||
declare function pdfbox:imageBinary($bufferedImage as item(),$type as xs:string)
|
|
||||||
as xs:base64Binary{
|
|
||||||
let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
|
|
||||||
let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, $bytes)
|
|
||||||
return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
|
|
||||||
=>convert:integers-to-base64()
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ fn:do-until shim for BaseX 9+ :)
|
|
||||||
declare function pdfbox:do-until(
|
|
||||||
$input as item()*,
|
$input as item()*,
|
||||||
$action as function(item()*, xs:integer) as item()*,
|
$action as function(item()*, xs:integer) as item()*,
|
||||||
$predicate as function(item()*, xs:integer) as xs:boolean?
|
$predicate as function(item()*, xs:integer) as xs:boolean?
|
||||||
|
|
|
@ -1,63 +0,0 @@
|
||||||
xquery version '3.1';
|
|
||||||
(:~ describe book page numbers as sequence of ranges, similar to PDF pagelabels
|
|
||||||
@author quodatum
|
|
||||||
:)
|
|
||||||
module namespace bookpages = 'urn:bookpages';
|
|
||||||
|
|
||||||
(:~ Invisible-xml grammar to parse custom pagelabel representation :)
|
|
||||||
declare variable $bookpages:grammar:="
|
|
||||||
book: pagecount,'#',range,(-',', range)*.
|
|
||||||
pagecount:['0'-'9']+.
|
|
||||||
range: s,from?,s,type,s,prefix?,s,offset?.
|
|
||||||
@from: ['0'-'9']+. { pageIndex }
|
|
||||||
@type: ['C'|'D'|'R'|'r'|'A'|'a'|'w'].
|
|
||||||
@prefix: -':',~[',']+.
|
|
||||||
@offset: -'@',['0'-'9']+.
|
|
||||||
|
|
||||||
-s: ([Zs]; #9; #a; #d)*. {Optional whitespace}
|
|
||||||
";
|
|
||||||
|
|
||||||
(:~
|
|
||||||
page number range in given style
|
|
||||||
:)
|
|
||||||
declare function bookpages:span($type as xs:string,$length as xs:integer,$first as xs:integer)
|
|
||||||
as xs:string*{
|
|
||||||
let $r:=$first to $first+$length
|
|
||||||
return switch ($type)
|
|
||||||
case "D" return $r!format-integer(.,"1")
|
|
||||||
case "r" return $r!format-integer(.,"i")
|
|
||||||
case "R" return $r!format-integer(.,"I")
|
|
||||||
case "C" return "Cover"
|
|
||||||
default return $r!format-integer(.,$type)
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ pagelabels from text:)
|
|
||||||
declare function bookpages:expand($pages as xs:string)
|
|
||||||
as xs:string*{
|
|
||||||
let $x:=bookpages:parse($pages)
|
|
||||||
let $last:=head($x)=>xs:integer()
|
|
||||||
return hof:until(
|
|
||||||
function($m){ empty($m?ranges) or count($m?result)eq $last },
|
|
||||||
function($m){
|
|
||||||
let $range:=head($m?ranges)=>trace("SS")
|
|
||||||
let $start:=if($range/@offset)then xs:integer($range/@offset) else 1
|
|
||||||
let $end:=($m?ranges[2]/xs:integer(@from)-1) otherwise $last
|
|
||||||
let $length:=$end -count($m?result)-1
|
|
||||||
let $span:=bookpages:span($range/@type,$length,$start)
|
|
||||||
let $span:=if($range/@prefix)then $span!concat($range/@prefix,.) else $span
|
|
||||||
return map {
|
|
||||||
'ranges': tail($m?ranges),
|
|
||||||
'result': ($m?result, $span)
|
|
||||||
}},
|
|
||||||
|
|
||||||
(: initial input = grammar ranges :)
|
|
||||||
map { 'ranges': tail($x) , 'result': () }
|
|
||||||
)?result
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ parse pagenumber description to xml :)
|
|
||||||
declare function bookpages:parse($pages as xs:string)
|
|
||||||
as element(range)*{
|
|
||||||
invisible-xml($bookpages:grammar)($pages)/*
|
|
||||||
};
|
|
||||||
|
|
|
@ -1,84 +0,0 @@
|
||||||
xquery version '3.1';
|
|
||||||
(:~ look for pagenos in pdf text
|
|
||||||
pdfscrape:page-report($doc )=>pdfscrape:inverted-map()
|
|
||||||
:)
|
|
||||||
module namespace pdfscrape = 'urn:pdfscrape';
|
|
||||||
import module namespace pdfbox="org.expkg_zone58.Pdfbox3" ;
|
|
||||||
|
|
||||||
(:~ page number regex
|
|
||||||
@todo last line and roman
|
|
||||||
1=Number system ( D=decimal, R=Roman)
|
|
||||||
2=Side L=left,R=right
|
|
||||||
:)
|
|
||||||
declare %private variable $pdfscrape:pats:=map{
|
|
||||||
"DL": "^([1-9][0-9]*).*",
|
|
||||||
"DR": ".*[^0-9]([1-9][0-9]*)$",
|
|
||||||
"RL": "^([ivxlc]+).*",
|
|
||||||
"RR": ".*[^ivxlc]([ivxlc]+)$"
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ page-reports for all pages :)
|
|
||||||
declare function pdfscrape:page-report($doc as item())
|
|
||||||
as element(page)*{
|
|
||||||
let $count:=pdfbox:page-count($doc)=>trace("Pages: ")
|
|
||||||
return (1 to $count )!pdfscrape:page-report($doc,.)
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ page-report for given page :)
|
|
||||||
declare function pdfscrape:page-report($doc as item(), $page as xs:integer)
|
|
||||||
as element(page){
|
|
||||||
let $txt:=pdfbox:getText($doc,$page)
|
|
||||||
let $line1:=substring-before($txt,file:line-separator())
|
|
||||||
let $fn:=function($acc,$this){ $acc otherwise pdfscrape:line-report($this,$line1)}
|
|
||||||
let $found:=map:keys($pdfscrape:pats)=>fold-left( (),$fn)
|
|
||||||
|
|
||||||
return <page index="{ $page }">{ $found, $line1 }</page>
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ attributes created by matching $style with $line1 or empty :)
|
|
||||||
declare function pdfscrape:line-report($style as xs:string, $line1 as xs:string)
|
|
||||||
as attribute(*)*{
|
|
||||||
if(matches($line1,$pdfscrape:pats?($style)))
|
|
||||||
then (
|
|
||||||
attribute {"style"} { substring($style,1,1) } ,(: 1st key:)
|
|
||||||
attribute {"LR"} { substring($style,2,1) } ,(: 2nd key:)
|
|
||||||
attribute {"number"} { replace($line1,$pdfscrape:pats?($style),"$1") }
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ keys are the parsed page numbers, values are the page indices where they were found :)
|
|
||||||
declare function pdfscrape:inverted-map($pages as element(page)*)
|
|
||||||
as map(*) {
|
|
||||||
$pages[@number]!map:entry(string(@number),string(@index))
|
|
||||||
=>map:merge(map{"duplicates":"combine"})
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ %match
|
|
||||||
$l page labels
|
|
||||||
:)
|
|
||||||
declare function pdfscrape:score($l as xs:string*,
|
|
||||||
$report as element(page)*)
|
|
||||||
{
|
|
||||||
let $s:=$report!(if(@number)then string(@number) else "")
|
|
||||||
let $match:= for-each-pair($l,$s,function($l,$s){if($s eq "")then 0 else if ($s eq $l)then 1 else -1})
|
|
||||||
return round(sum($match) div count($l) *100,0)
|
|
||||||
};
|
|
||||||
|
|
||||||
(:~ convert roman to integer, zero if invalid
|
|
||||||
@see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/
|
|
||||||
:)
|
|
||||||
declare function pdfscrape:decode-roman-numeral($roman-numeral as xs:string)
|
|
||||||
as xs:integer{
|
|
||||||
$roman-numeral => upper-case() => characters()
|
|
||||||
=> for-each(map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 })
|
|
||||||
=> fold-right([0,0], function($number,$accumulator) {
|
|
||||||
if ($number lt $accumulator?2)
|
|
||||||
then [ $accumulator?1 - $number, $number ]
|
|
||||||
else [ $accumulator?1 + $number, $number ] } )
|
|
||||||
=> array:head()
|
|
||||||
};
|
|
||||||
|
|
||||||
declare function pdfscrape:characters($str as xs:string)
|
|
||||||
{
|
|
||||||
|
|
||||||
};
|
|
|
@ -17,7 +17,7 @@ declare %unit:test
|
||||||
function test:specification(){
|
function test:specification(){
|
||||||
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
|
let $pdf:=test:pdf("samples.pdf/BaseX100.pdf")
|
||||||
let $spec:=pdfbox:specification($pdf)
|
let $spec:=pdfbox:specification($pdf)
|
||||||
return unit:assert-equals($spec,0+1.4)
|
return unit:assert-equals($spec,"1.4")
|
||||||
};
|
};
|
||||||
|
|
||||||
declare %unit:test
|
declare %unit:test
|
||||||
|
|
Loading…
Add table
Reference in a new issue