diff --git a/LICENSE b/LICENSE index 261eeb9..b90bd6c 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2024 Andy Bunce Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/docs/pdfbox.xqbk b/docs/pdfbox.xqbk new file mode 100644 index 0000000..95f82c3 --- /dev/null +++ b/docs/pdfbox.xqbk @@ -0,0 +1 @@ +{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. 
Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= 
$samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"}]} \ No newline at end of file diff --git a/src/lib/pageno.xqm b/src/lib/pageno.xqm new file mode 100644 index 0000000..7626b3f --- /dev/null +++ b/src/lib/pageno.xqm @@ -0,0 +1,74 @@ +xquery version '3.1'; +(:~ look for pagenos in pdf text +pagenos:page-report($doc )=>pagenos:inverted-map() +:) +module namespace pagenos = 'urn:pageno'; +import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm"; + +(: look for possible page number in first/last line of page text +@todo last line and roman +1=Number system ( D=decimal, R=Roman) +2=Side L=left,R=right +:) +declare variable $pagenos:pats:=map{ + "DL": "^([1-9][0-9]*).*", + "DR": ".*[^0-9]([1-9][0-9]*)$", + "RL": "^([ivxc]+).*", + "RR": ".*[^ivxc]([ivxc]+)$" +}; + +(: page-reports for all pages :) +declare function pagenos:page-report($doc as item()) +as element(page)*{ + let $count:=pdfbox:page-count($doc)=>trace("Pages: ") + return (0 to $count -1)!pagenos:page-report($doc,.) 
+};
+
+(:~ Report for one page: a page element carrying @index, any match attributes, and the first text line. :)
+declare function pagenos:page-report($doc as item(), $page as xs:integer)
+as element(page){
+  let $txt:=pdfbox:getText($doc,$page)
+  let $line1:=substring-before($txt,file:line-separator()) (: text before the first line break; "" when there is none :)
+  let $fn:=function($acc,$this){ $acc otherwise pagenos:line-report($this,$line1)} (: keep the first non-empty result :)
+  let $found:=map:keys($pagenos:pats)=>sort()=>fold-left( (),$fn) (: sorted, because map key order is not guaranteed :)
+
+  return <page index="{$page}">{ $found, $line1 }</page>
+};
+
+(:~ Attributes style, LR and number when $line1 matches the pattern stored under key $style; otherwise empty. :)
+declare function pagenos:line-report($style as xs:string, $line1 as xs:string)
+as attribute(*)*{
+  if(matches($line1,$pagenos:pats?($style)))
+  then (
+    attribute {"style"} { substring($style,1,1) } ,(: 1st letter of the key: number system, D or R :)
+    attribute {"LR"} { substring($style,2,1) } ,(: 2nd letter of the key: side, L or R :)
+    attribute {"number"} { replace($line1,$pagenos:pats?($style),"$1") } (: text captured by group 1 :)
+  )
+};
+
+(:~ Map from each @number value to the @index value(s), as strings, of the pages carrying it; pages without @number are skipped. :)
+declare function pagenos:inverted-map($pages as element(page)*)
+as map(*) {
+  $pages[@number]!map:entry(string(@number),string(@index))
+  =>map:merge(map{"duplicates":"combine"})
+};
+
+(:~ Convert a roman numeral (any case) to an integer. Characters that are not roman digits are
+skipped rather than rejected, so only input containing no roman digit at all yields 0.
+@see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/ :)
+declare function pagenos:decode-roman-numeral($roman-numeral as xs:string)
+as xs:integer{
+  $roman-numeral => upper-case() => pagenos:characters()
+  => for-each(map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 })
+  => fold-right([0,0], function($number,$accumulator) {
+       if ($number lt $accumulator?2)
+       then [ $accumulator?1 - $number, $number ]
+       else [ $accumulator?1 + $number, $number ] } )
+  => array:head()
+};
+
+(:~ Split $value into one single-character string per codepoint; local stand-in for the XPath 4 characters function. :)
+declare function pagenos:characters($value as xs:string?)
+as xs:string*{
+  fn:string-to-codepoints($value) ! fn:codepoints-to-string(.)
+}; \ No newline at end of file diff --git a/xquery/lib/pdfbox3.xqm b/src/lib/pdfbox3.xqm similarity index 83% rename from xquery/lib/pdfbox3.xqm rename to src/lib/pdfbox3.xqm index 50f4e81..f33bcb8 100644 --- a/xquery/lib/pdfbox3.xqm +++ b/src/lib/pdfbox3.xqm @@ -1,6 +1,6 @@ xquery version '3.1'; (:~ -pdfbox 3.0 https://pdfbox.apache.org/ interface library, +pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10+ interface library, requires pdfbox jar on classpath 3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar @see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader @@ -52,8 +52,11 @@ as xs:string{ PDDocument:save($doc,File:new($savepath)),$savepath }; -declare function pdfbox:close($doc){ - PDDocument:close($doc) +declare function pdfbox:close($doc) +as empty-sequence(){ + (# db:wrapjava void #) { + PDDocument:close($doc) + } }; declare function pdfbox:page-count($doc as item()) @@ -69,15 +72,15 @@ as map(*)*{ let $bookmark:= PDDocument:getDocumentCatalog($doc) =>PDDocumentCatalog:getDocumentOutline() - =>PDOutlineItem:getFirstChild()=>trace("cur") + =>PDOutlineItem:getFirstChild() - let $bk:=pdfbox:outline($bookmark ,$doc) + let $bk:=pdfbox:outline($doc,$bookmark) return $bk } }; (: return bookmark info for children of $outlineItem :) -declare function pdfbox:outline($outlineItem,$doc ) +declare function pdfbox:outline($doc,$outlineItem ) as map(*)* { let $find:=hof:until( @@ -85,7 +88,7 @@ as map(*)* function($input ) { let $bk:= pdfbox:bookmark($input?this,$doc) let $bk:= if($bk?hasChildren) - then let $kids:=pdfbox:outline(PDOutlineItem:getFirstChild($input?this), $doc) + then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this)) return map:merge(($bk,map:entry("children",$kids))) else $bk return map{ @@ -97,16 +100,22 @@ as map(*)* return $find?list }; -declare function pdfbox:outline-XML($outline as map(*)*) -as element(*){ +declare function pdfbox:outline-xml($outline as map(*)*) +as element(outline){ element 
outline { - for $bookmark in $outline - return - {$bookmark?children!pdfbox:outline-XML(.)} - + $outline!pdfbox:bookmark-xml(.) } }; +declare function pdfbox:bookmark-xml($outline as map(*)*) +as element(bookmark)* +{ + $outline! + + {?children!pdfbox:bookmark-xml(.)} + +}; + (: return bookmark info for children of $outlineItem :) declare function pdfbox:bookmark($bookmark as item(),$doc as item()) as map(*){ @@ -135,13 +144,14 @@ declare function pdfbox:pageIndex( -(:~ new PDF doc from 1 based page range :) -declare function pdfbox:extract($doc as item(),$target as xs:string, - $start as xs:integer,$end as xs:integer) +(:~ new PDF doc from 1 based page range +@return save path :) +declare function pdfbox:extract($doc as item(), + $start as xs:integer,$end as xs:integer,$target as xs:string) +as xs:string { let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract() - let $map:=pdfbox:save($a,$target) - return pdfbox:close($a) + return (pdfbox:save($a,$target),pdfbox:close($a)) }; @@ -154,9 +164,6 @@ as item()*{ =>PDDocumentCatalog:getPageLabels() =>PDPageLabels:getLabelsByPageIndices() }; -(:~ @TODO -@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files -:) (: text on $pageNo :) declare function pdfbox:getText($doc as item(), $pageNo as xs:integer) diff --git a/xquery/scratch/pdfbox.xq b/src/scratch/pdfbox.xq similarity index 87% rename from xquery/scratch/pdfbox.xq rename to src/scratch/pdfbox.xq index ff7ac97..747aae5 100644 --- a/xquery/scratch/pdfbox.xq +++ b/src/scratch/pdfbox.xq @@ -13,10 +13,10 @@ declare variable $samples:= map{ }; declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data"; (:~ resolve :) -declare variable $PDF:= $samples?climate=>file:resolve-path($base); +declare variable $PDF:= $samples?women=>file:resolve-path($base); let $doc:=pdfbox:open($PDF) -return pdfbox:outline($doc)=>pdfbox:outline-XML() +return 
pdfbox:outline($doc)=>pdfbox:outline-xml() (: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :) \ No newline at end of file