1
0

[add] notebook

This commit is contained in:
Andy Bunce 2024-02-28 23:11:09 +00:00
parent e1c74c9608
commit 8ea1a343de
5 changed files with 106 additions and 24 deletions

View File

@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier same "printed page" as the copyright notice for easier
identification within third-party archives. identification within third-party archives.
Copyright [yyyy] [name of copyright owner] Copyright 2024 Andy Bunce
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.

1
docs/pdfbox.xqbk Normal file
View File

@ -0,0 +1 @@
{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet 
$doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"}]}

74
src/lib/pageno.xqm Normal file
View File

@ -0,0 +1,74 @@
xquery version '3.1';
(:~ Look for printed page numbers in the text of PDF pages.
Typical use: pagenos:page-report($doc) => pagenos:inverted-map()
:)
module namespace pagenos = 'urn:pageno';
import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm";
(:~ Regular expressions that pick a candidate page number out of one line of page text.
Each key is two letters:
  1st = number system, D for decimal or R for lowercase Roman
  2nd = side of the line, L for left (start) or R for right (end)
Every pattern spans the whole line and captures the number as group 1, so
replace($line, $pattern, "$1") yields just the number.
The Roman patterns accept all seven digits (i v x l c d m) and require the numeral
not to touch another lowercase letter, so "xli" is read as "xli" rather than "x"
and ordinary words such as "chapter" or "Music" are not reported as numerals.
@todo last line of page
:)
declare variable $pagenos:pats:=map{
"DL": "^([1-9][0-9]*).*",
"DR": ".*[^0-9]([1-9][0-9]*)$",
"RL": "^([ivxlcdm]+)([^a-z].*)?$",
"RR": ".*[^a-z]([ivxlcdm]+)$"
};
(:~ Page reports for every page of $doc.
Asks pdfbox:page-count for the number of pages (traced with the label "Pages: ")
and produces one report per index from 0 up to count - 1.
NOTE(review): indices are passed on zero-based; confirm that pdfbox:getText
expects zero-based page numbers.
@param $doc document handle as accepted by pdfbox:page-count
@return one page element per page, in index order
:)
declare function pagenos:page-report($doc as item())
as element(page)*{
let $total:=trace(pdfbox:page-count($doc),"Pages: ")
for $index in 0 to $total - 1
return pagenos:page-report($doc,$index)
};
(:~ Page report for one page.
Takes the first line of the page text and tries each pattern in $pagenos:pats
against it; the attributes of the first matching pattern are kept.
@param $doc document handle as accepted by pdfbox:getText
@param $page page index, handed unchanged to pdfbox:getText
@return page element with the index attribute, the attributes of the first
        matching pattern (if any) and the first line as text content
:)
declare function pagenos:page-report($doc as item(), $page as xs:integer)
as element(page){
let $txt:=pdfbox:getText($doc,$page)
(: first line under any line-ending convention; the whole text when it has no line break :)
let $line1:=string(head(tokenize($txt,'\r\n|\r|\n')))
(: map:keys has no defined order; sorting makes the winning pattern repeatable :)
let $styles:=sort(map:keys($pagenos:pats))
let $fn:=function($acc,$style){ $acc otherwise pagenos:line-report($style,$line1)}
let $found:=fold-left($styles,(),$fn)
return <page index="{ $page }">{ $found, $line1 }</page>
};
(:~ Attributes describing a page number found in $line1, or nothing.
@param $style key into $pagenos:pats; 1st letter is the number system, 2nd the side
@param $line1 line of page text to test
@return empty when the pattern for $style does not match, otherwise three attributes:
        style (1st letter of the key), LR (2nd letter) and number (capture group 1)
:)
declare function pagenos:line-report($style as xs:string, $line1 as xs:string)
as attribute(*)*{
let $pattern:=$pagenos:pats($style)
return
  if(matches($line1,$pattern))
  then (
    attribute style { substring($style,1,1) },
    attribute LR { substring($style,2,1) },
    attribute number { replace($line1,$pattern,"$1") }
  )
  else ()
};
(:~ Lookup from parsed page number to the page indices where it was found.
Pages without a number attribute are skipped. When the same number occurs on
several pages their indices are combined into one sequence, in input order.
@param $pages page elements carrying index and (optionally) number attributes
@return map whose keys are number strings and whose values are index strings
:)
declare function pagenos:inverted-map($pages as element(page)*)
as map(*) {
let $entries:=
  for $page in $pages
  where exists($page/@number)
  return map:entry(string($page/@number),string($page/@index))
return map:merge($entries,map{"duplicates":"combine"})
};
(:~ Convert a Roman numeral to an integer.
Case is ignored. Returns 0 for the empty string and whenever any character is
not one of M D C L X V I. The order of the digits is not validated beyond
that: a digit smaller than the one to its right is subtracted, any other is added.
@param $roman-numeral numeral to decode
@return decoded value, or 0 if invalid
@see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/
:)
declare function pagenos:decode-roman-numeral($roman-numeral as xs:string)
as xs:integer{
let $values:=map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 }
let $digits:=$roman-numeral => upper-case() => pagenos:characters()
return
  (: a bare map lookup would silently drop unknown characters, so reject them first :)
  if (some $digit in $digits satisfies not(map:contains($values,$digit)))
  then 0
  else
    (: walk right to left; the accumulator is [running total, digit to the right] :)
    fold-right($digits ! $values(.), [0,0], function($number,$accumulator) {
      if ($number lt $accumulator?2)
      then [ $accumulator?1 - $number, $number ]
      else [ $accumulator?1 + $number, $number ] } )
    => array:head()
};
(:~ Split a string into its individual characters, one string per codepoint.
Stand-in for the characters function proposed for XPath 4.
@param $value string to split; empty sequence allowed
@return one single-character string per codepoint, empty for "" or ()
:)
declare function pagenos:characters($value as xs:string?)
as xs:string*{
for $codepoint in fn:string-to-codepoints($value)
return fn:codepoints-to-string($codepoint)
};

View File

@ -1,6 +1,6 @@
xquery version '3.1'; xquery version '3.1';
(:~ (:~
pdfbox 3.0 https://pdfbox.apache.org/ interface library, pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10+ interface library,
requires pdfbox jar on classpath requires pdfbox jar on classpath
3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar 3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar
@see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader @see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader
@ -52,8 +52,11 @@ as xs:string{
PDDocument:save($doc,File:new($savepath)),$savepath PDDocument:save($doc,File:new($savepath)),$savepath
}; };
declare function pdfbox:close($doc){ declare function pdfbox:close($doc)
as empty-sequence(){
(# db:wrapjava void #) {
PDDocument:close($doc) PDDocument:close($doc)
}
}; };
declare function pdfbox:page-count($doc as item()) declare function pdfbox:page-count($doc as item())
@ -69,15 +72,15 @@ as map(*)*{
let $bookmark:= let $bookmark:=
PDDocument:getDocumentCatalog($doc) PDDocument:getDocumentCatalog($doc)
=>PDDocumentCatalog:getDocumentOutline() =>PDDocumentCatalog:getDocumentOutline()
=>PDOutlineItem:getFirstChild()=>trace("cur") =>PDOutlineItem:getFirstChild()
let $bk:=pdfbox:outline($bookmark ,$doc) let $bk:=pdfbox:outline($doc,$bookmark)
return $bk return $bk
} }
}; };
(: return bookmark info for children of $outlineItem :) (: return bookmark info for children of $outlineItem :)
declare function pdfbox:outline($outlineItem,$doc ) declare function pdfbox:outline($doc,$outlineItem )
as map(*)* as map(*)*
{ {
let $find:=hof:until( let $find:=hof:until(
@ -85,7 +88,7 @@ as map(*)*
function($input ) { function($input ) {
let $bk:= pdfbox:bookmark($input?this,$doc) let $bk:= pdfbox:bookmark($input?this,$doc)
let $bk:= if($bk?hasChildren) let $bk:= if($bk?hasChildren)
then let $kids:=pdfbox:outline(PDOutlineItem:getFirstChild($input?this), $doc) then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
return map:merge(($bk,map:entry("children",$kids))) return map:merge(($bk,map:entry("children",$kids)))
else $bk else $bk
return map{ return map{
@ -97,16 +100,22 @@ as map(*)*
return $find?list return $find?list
}; };
declare function pdfbox:outline-XML($outline as map(*)*) declare function pdfbox:outline-xml($outline as map(*)*)
as element(*){ as element(outline){
element outline { element outline {
for $bookmark in $outline $outline!pdfbox:bookmark-xml(.)
return <bookmark title="{$bookmark?title}" index="{$bookmark?index}">
{$bookmark?children!pdfbox:outline-XML(.)}
</bookmark>
} }
}; };
declare function pdfbox:bookmark-xml($outline as map(*)*)
as element(bookmark)*
{
$outline!
<bookmark title="{?title}" index="{?index}">
{?children!pdfbox:bookmark-xml(.)}
</bookmark>
};
(: return bookmark info for children of $outlineItem :) (: return bookmark info for children of $outlineItem :)
declare function pdfbox:bookmark($bookmark as item(),$doc as item()) declare function pdfbox:bookmark($bookmark as item(),$doc as item())
as map(*){ as map(*){
@ -135,13 +144,14 @@ declare function pdfbox:pageIndex(
(:~ new PDF doc from 1 based page range :) (:~ new PDF doc from 1 based page range
declare function pdfbox:extract($doc as item(),$target as xs:string, @return save path :)
$start as xs:integer,$end as xs:integer) declare function pdfbox:extract($doc as item(),
$start as xs:integer,$end as xs:integer,$target as xs:string)
as xs:string
{ {
let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract() let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract()
let $map:=pdfbox:save($a,$target) return (pdfbox:save($a,$target),pdfbox:close($a))
return pdfbox:close($a)
}; };
@ -154,9 +164,6 @@ as item()*{
=>PDDocumentCatalog:getPageLabels() =>PDDocumentCatalog:getPageLabels()
=>PDPageLabels:getLabelsByPageIndices() =>PDPageLabels:getLabelsByPageIndices()
}; };
(:~ @TODO
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
:)
(: text on $pageNo :) (: text on $pageNo :)
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer) declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)

View File

@ -13,10 +13,10 @@ declare variable $samples:= map{
}; };
declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data"; declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data";
(:~ resolve :) (:~ resolve :)
declare variable $PDF:= $samples?climate=>file:resolve-path($base); declare variable $PDF:= $samples?women=>file:resolve-path($base);
let $doc:=pdfbox:open($PDF) let $doc:=pdfbox:open($PDF)
return pdfbox:outline($doc)=>pdfbox:outline-XML() return pdfbox:outline($doc)=>pdfbox:outline-xml()
(: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :) (: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :)