diff --git a/README.md b/README.md index 8217a41..e8bb31e 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,6 @@ BaseX (10+) interface to [Pdfbox](https://pdfbox.apache.org/) version 3 * save pdf page as image -## Jars -* fontbox-3.0.2.jar -* pdfbox-3.0.2.jar -* pdfbox-io-3.0.2.jar -* commons-logging-1.3.1.jar +## Build -3.6 mb \ No newline at end of file +Use `scripts/make-fat-jar.xq` to package the required `jar`s and `xqm` files to the `dist` folder. diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index a1cb737..3c5ed00 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -7,9 +7,7 @@ declare variable $urls := ( "org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", "commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" ); -(: Main execution -Main-Class: org.basex.modules.Hello - :) + let $config :=map { "base": file:resolve-path("../",static-base-uri()), "manifest-jar" : "pdfbox-3.0.4.jar", diff --git a/src/Pdfbox3.xqm b/src/Pdfbox3.xqm index a2a9315..961779e 100644 --- a/src/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -10,28 +10,19 @@ module namespace pdfbox="org.expkg_zone58.Pdfbox3"; declare namespace Loader ="java:org.apache.pdfbox.Loader"; declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; - -(:~ @javadoc org/apache/pdfbox/pdmodel/PDDocument.html :) declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; -(:~ @javadoc org/apache/pdfbox/multipdf/PageExtractor.html :) declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -(:~ @javadoc org/apache/pdfbox/pdmodel/PDPageTree.html :) declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; -(:~ -@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html -:) declare 
namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; -(:~ -@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html -:) + declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; @@ -46,15 +37,19 @@ as xs:string{ (:~ open pdf, returns pdf object :) declare function pdfbox:open($pdfpath as xs:string) as item(){ - Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) + try{ + Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) +} catch *{ + error(xs:QName("pdfbox:open"),"Failed to open: " || $pdfpath) +} }; (:~ the version of the PDF specification used by $pdf e.g "1.4" -returned as string to avoid rounding issues +returned as string (rounded to 4 decimal places) to avoid float rounding issues :) declare function pdfbox:specification($pdf as item()) as xs:string{ - PDDocument:getVersion($pdf)=>string() + PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string() }; (:~ save pdf $pdf to $savepath , returns $savepath :) @@ -77,10 +72,23 @@ as xs:integer{ PDDocument:getNumberOfPages($pdf) }; +(:~ render of $pdf page to image +options.format="gif","png" etc, options.scale= 1 is 72 dpi?? 
:) +declare function pdfbox:page-image($pdf as item(),$pageNo as xs:integer,$options as map(*)) +as xs:base64Binary{ + let $options:=map:merge(($options,map{"format":"gif","scale":1})) + let $bufferedImage:=PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$options?scale) + let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() + let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage ,$options?format, $bytes) + return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) + =>convert:integers-to-base64() + +}; + (:~ map with document metadata :) -declare function pdfbox:information($doc as item()) +declare function pdfbox:information($pdf as item()) as map(*){ - let $info:=PDDocument:getDocumentInformation($doc) + let $info:=PDDocument:getDocumentInformation($pdf) return map{ "title": PDDocumentInformation:getTitle($info), "creator": PDDocumentInformation:getCreator($info), @@ -92,12 +100,7 @@ as map(*){ } }; - (:~ convert date :) -declare %private -function pdfbox:gregToISO($item as item()) -as xs:string{ - Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() -}; + (:~ outline for $pdf as map()* :) declare function pdfbox:outline($pdf as item()) @@ -167,13 +170,13 @@ as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$pdf)=>pdfbox:page-index($pdf), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("�",""), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)} + (:=>translate("�",""), :), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }; - (:~ pageIndex of $page in $pdf :) declare function pdfbox:page-index( $page as item()? 
(: as java:org.apache.pdfbox.pdmodel.PDPage :), @@ -234,35 +237,17 @@ as map(*){ )=>map:merge() }; -(:~ java:bufferedImage for $pageNo using $scale times dpi= 72 -@param $pageNo (ZERO based) -@param $scale 1=72 dpi -@return Java java.awt.image.BufferedImage object +(:~ convert date :) +declare %private +function pdfbox:gregToISO($item as item()) +as xs:string{ + Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string() +}; + +(:~ fn:do-until shim for BaseX 9+10 +if fn:do-until not found use hof:until :) -declare function pdfbox:pageBufferedImage($pdf as item(), $pageNo as xs:integer,$scale as xs:float) -as item(){ - PDFRenderer:new($pdf)=>PDFRenderer:renderImage($pageNo,$scale) -}; - -(:~ save bufferedimage to $dest -@param $type = "gif","png" etc:) -declare function pdfbox:imageSave($bufferedImage as item(),$dest as xs:string,$type as xs:string) -as xs:boolean{ - Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, File:new($dest)) -}; - -(:~ return image -@param $type = "gif","png" etc:) -declare function pdfbox:imageBinary($bufferedImage as item(),$type as xs:string) -as xs:base64Binary{ - let $bytes:=Q{java:java.io.ByteArrayOutputStream}new() - let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, $bytes) - return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes) - =>convert:integers-to-base64() -}; - -(:~ fn:do-until shim for BaseX 9+ :) -declare function pdfbox:do-until( +declare %private function pdfbox:do-until( $input as item()*, $action as function(item()*, xs:integer) as item()*, $predicate as function(item()*, xs:integer) as xs:boolean? 
diff --git a/src/lib/bookpages.xqm b/src/lib/bookpages.xqm deleted file mode 100644 index a4edb34..0000000 --- a/src/lib/bookpages.xqm +++ /dev/null @@ -1,63 +0,0 @@ -xquery version '3.1'; -(:~ describe book page numbers as sequence of ranges, similar to PDF pagelabels -@author quodatum -:) -module namespace bookpages = 'urn:bookpages'; - -(:~ Invisible-xml grammar to parse custom pagelabel representation :) -declare variable $bookpages:grammar:=" -book: pagecount,'#',range,(-',', range)*. -pagecount:['0'-'9']+. -range: s,from?,s,type,s,prefix?,s,offset?. -@from: ['0'-'9']+. { pageIndex } -@type: ['C'|'D'|'R'|'r'|'A'|'a'|'w']. -@prefix: -':',~[',']+. -@offset: -'@',['0'-'9']+. - --s: ([Zs]; #9; #a; #d)*. {Optional whitespace} -"; - -(:~ -page number range in given style -:) -declare function bookpages:span($type as xs:string,$length as xs:integer,$first as xs:integer) -as xs:string*{ -let $r:=$first to $first+$length -return switch ($type) - case "D" return $r!format-integer(.,"1") - case "r" return $r!format-integer(.,"i") - case "R" return $r!format-integer(.,"I") - case "C" return "Cover" - default return $r!format-integer(.,$type) -}; - -(:~ pagelabels from text:) -declare function bookpages:expand($pages as xs:string) -as xs:string*{ - let $x:=bookpages:parse($pages) - let $last:=head($x)=>xs:integer() - return hof:until( - function($m){ empty($m?ranges) or count($m?result)eq $last }, - function($m){ - let $range:=head($m?ranges)=>trace("SS") - let $start:=if($range/@offset)then xs:integer($range/@offset) else 1 - let $end:=($m?ranges[2]/xs:integer(@from)-1) otherwise $last - let $length:=$end -count($m?result)-1 - let $span:=bookpages:span($range/@type,$length,$start) - let $span:=if($range/@prefix)then $span!concat($range/@prefix,.) 
else $span - return map { - 'ranges': tail($m?ranges), - 'result': ($m?result, $span) - }}, - - (: initial input = grammar ranges :) - map { 'ranges': tail($x) , 'result': () } - )?result -}; - -(:~ parse pagenumber description to xml :) -declare function bookpages:parse($pages as xs:string) -as element(range)*{ - invisible-xml($bookpages:grammar)($pages)/* -}; - diff --git a/src/lib/pdfscrape.xqm b/src/lib/pdfscrape.xqm deleted file mode 100644 index 5088ec2..0000000 --- a/src/lib/pdfscrape.xqm +++ /dev/null @@ -1,84 +0,0 @@ -xquery version '3.1'; -(:~ look for pagenos in pdf text -pdfscrape:page-report($doc )=>pdfscrape:inverted-map() -:) -module namespace pdfscrape = 'urn:pdfscrape'; -import module namespace pdfbox="org.expkg_zone58.Pdfbox3" ; - -(:~ page number regex -@todo last line and roman -1=Number system ( D=decimal, R=Roman) -2=Side L=left,R=right -:) -declare %private variable $pdfscrape:pats:=map{ - "DL": "^([1-9][0-9]*).*", - "DR": ".*[^0-9]([1-9][0-9]*)$", - "RL": "^([ivxlc]+).*", - "RR": ".*[^ivxlc]([ivxlc]+)$" -}; - -(:~ page-reports for all pages :) -declare function pdfscrape:page-report($doc as item()) -as element(page)*{ - let $count:=pdfbox:page-count($doc)=>trace("Pages: ") - return (1 to $count )!pdfscrape:page-report($doc,.) 
-}; - -(:~ page-report for given page :) -declare function pdfscrape:page-report($doc as item(), $page as xs:integer) -as element(page){ - let $txt:=pdfbox:getText($doc,$page) - let $line1:=substring-before($txt,file:line-separator()) - let $fn:=function($acc,$this){ $acc otherwise pdfscrape:line-report($this,$line1)} - let $found:=map:keys($pdfscrape:pats)=>fold-left( (),$fn) - - return { $found, $line1 } -}; - -(:~ attributes created by matching $style with $line1 or empty :) -declare function pdfscrape:line-report($style as xs:string, $line1 as xs:string) -as attribute(*)*{ - if(matches($line1,$pdfscrape:pats?($style))) - then ( - attribute {"style"} { substring($style,1,1) } ,(: 1st key:) - attribute {"LR"} { substring($style,2,1) } ,(: 2nd key:) - attribute {"number"} { replace($line1,$pdfscrape:pats?($style),"$1") } - ) -}; - -(:~ keys are parsed pageno values are pageindices where found:) -declare function pdfscrape:inverted-map($pages as element(page)*) -as map(*) { - $pages[@number]!map:entry(string(@number),string(@index)) - =>map:merge(map{"duplicates":"combine"}) -}; - -(:~ %match -$l page labels -:) -declare function pdfscrape:score($l as xs:string*, - $report as element(page)*) -{ - let $s:=$report!(if(@number)then string(@number) else "") - let $match:= for-each-pair($l,$s,function($l,$s){if($s eq "")then 0 else if ($s eq $l)then 1 else -1}) -return round(sum($match) div count($l) *100,0) -}; - -(:~ convert roman to integer, zero if invalid -@see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/ -:) -declare function pdfscrape:decode-roman-numeral($roman-numeral as xs:string) -as xs:integer{ - $roman-numeral => upper-case() => characters() - => for-each(map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 }) - => fold-right([0,0], function($number,$accumulator) { - if ($number lt $accumulator?2) - then [ $accumulator?1 - $number, $number ] - else [ $accumulator?1 + $number, $number ] } ) - => array:head() -}; 
- -declare function pdfscrape:characters($str as xs:string) -{ - -}; diff --git a/src/test/test.xqm b/src/test/test.xqm index 7b0958b..9a527ec 100644 --- a/src/test/test.xqm +++ b/src/test/test.xqm @@ -17,7 +17,7 @@ declare %unit:test function test:specification(){ let $pdf:=test:pdf("samples.pdf/BaseX100.pdf") let $spec:=pdfbox:specification($pdf) - return unit:assert-equals($spec,0+1.4) + return unit:assert-equals($spec,"1.4") }; declare %unit:test