[add] notebook
This commit is contained in:
parent
e1c74c9608
commit
8ea1a343de
2
LICENSE
2
LICENSE
@ -186,7 +186,7 @@
|
|||||||
same "printed page" as the copyright notice for easier
|
same "printed page" as the copyright notice for easier
|
||||||
identification within third-party archives.
|
identification within third-party archives.
|
||||||
|
|
||||||
Copyright [yyyy] [name of copyright owner]
|
Copyright 2024 Andy Bunce
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
1
docs/pdfbox.xqbk
Normal file
1
docs/pdfbox.xqbk
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet 
$doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"}]}
|
74
src/lib/pageno.xqm
Normal file
74
src/lib/pageno.xqm
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
xquery version '3.1';
|
||||||
|
(:~ Look for page numbers ("pagenos") in the text of PDF pages.
 : Usage: pagenos:page-report($doc) => pagenos:inverted-map()
 :)
|
||||||
|
module namespace pagenos = 'urn:pageno';
|
||||||
|
import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm";
|
||||||
|
|
||||||
|
(:~
 : Regular expressions used to look for a possible page number in the
 : first/last line of page text.
 : Each key is a two letter code:
 :   1st letter = number system (D = decimal, R = Roman)
 :   2nd letter = side (L = left, R = right)
 : Every pattern has a single capture group holding the candidate number.
 : @todo last line and roman
 :)
declare variable $pagenos:pats := map:merge((
  map:entry("DL", "^([1-9][0-9]*).*"),
  map:entry("DR", ".*[^0-9]([1-9][0-9]*)$"),
  map:entry("RL", "^([ivxc]+).*"),
  map:entry("RR", ".*[^ivxc]([ivxc]+)$")
));
|
||||||
|
|
||||||
|
(:~
 : Page reports for all pages of a document.
 : The page count is obtained from pdfbox and traced with the label "Pages: ";
 : one report is then produced for each index from 0 to count - 1.
 : @param $doc document handle, passed through to the pdfbox functions
 : @return one page element per page index
 :)
declare function pagenos:page-report($doc as item())
as element(page)*{
  let $total := trace(pdfbox:page-count($doc), "Pages: ")
  for $index in 0 to $total - 1
  return pagenos:page-report($doc, $index)
};
|
||||||
|
|
||||||
|
(:~
 : Page report for a given page.
 : Takes the first line of the page text and tries each pattern in
 : $pagenos:pats against it; the attributes of the first matching pattern
 : (if any) are attached to the result, followed by the first line as text.
 :
 : Pattern keys are tried in sorted order (DL, DR, RL, RR) because the order
 : returned by map:keys is implementation-dependent; sorting makes the result
 : deterministic when a line matches more than one pattern.
 :
 : @param $doc  document handle, passed through to pdfbox:getText
 : @param $page page index, passed through to pdfbox:getText
 : @return page element with an index attribute, optional match attributes,
 :         and the first line of text
 :)
declare function pagenos:page-report($doc as item(), $page as xs:integer)
as element(page){
  let $txt := pdfbox:getText($doc,$page)
  let $eol := file:line-separator()
  (: substring-before yields "" when no separator is present, which would
     discard a page consisting of a single unterminated line :)
  let $line1 :=
    if (contains($txt, $eol))
    then substring-before($txt, $eol)
    else string($txt)
  (: keep the first non-empty match, later patterns are then not evaluated :)
  let $fn := function($acc,$this){ $acc otherwise pagenos:line-report($this,$line1)}
  let $found := sort(map:keys($pagenos:pats)) => fold-left((), $fn)
  return <page index="{ $page }">{ $found, $line1 }</page>
};
|
||||||
|
|
||||||
|
(:~
 : Attributes describing a page number found in a line, or empty if none.
 : The pattern is looked up in $pagenos:pats under the key $style.
 : @param $style two letter key into $pagenos:pats
 : @param $line1 line of text to test
 : @return empty when the line does not match, otherwise three attributes:
 :         style  = 1st letter of the key,
 :         LR     = 2nd letter of the key,
 :         number = capture group 1 of the match
 :)
declare function pagenos:line-report($style as xs:string, $line1 as xs:string)
as attribute(*)*{
  let $regex := $pagenos:pats?($style)
  return
    if (not(matches($line1, $regex)))
    then ()
    else (
      attribute style  { substring($style, 1, 1) },
      attribute LR     { substring($style, 2, 1) },
      attribute number { replace($line1, $regex, "$1") }
    )
};
|
||||||
|
|
||||||
|
(:~
 : Invert a sequence of page reports into a lookup map.
 : Keys are the parsed page numbers (the number attribute as a string),
 : values are the page indices (the index attribute as a string) where that
 : number was found. Pages without a number attribute are ignored; indices
 : sharing the same number are combined into one sequence.
 : @param $pages page elements as produced by pagenos:page-report
 : @return map from page number to page indices
 :)
declare function pagenos:inverted-map($pages as element(page)*)
as map(*) {
  map:merge(
    for $p in $pages
    where exists($p/@number)
    return map:entry(string($p/@number), string($p/@index)),
    map{"duplicates":"combine"}
  )
};
|
||||||
|
|
||||||
|
(:~
 : Convert a roman numeral to an integer, zero if invalid.
 : Input is case-insensitive. "Invalid" means the string is empty or contains
 : any character other than M, D, C, L, X, V, I; the ordering of the symbols
 : is not validated (so for example "IIII" still decodes to 4).
 : @param $roman-numeral numeral to decode
 : @return decoded value, or 0 for invalid input
 : @see https://joewiz.org/2021/05/30/converting-roman-numerals-with-xquery-xslt/
 :)
declare function pagenos:decode-roman-numeral($roman-numeral as xs:string)
as xs:integer{
  let $values := map { "M": 1000, "D": 500, "C": 100, "L": 50, "X": 10, "V": 5, "I": 1 }
  let $upper := upper-case($roman-numeral)
  return
    (: without this guard unknown characters are silently dropped by the map
       lookup, so "ABC" would decode as 100 instead of 0 :)
    if (not(matches($upper, "^[MDCLXVI]+$")))
    then 0
    else
      pagenos:characters($upper)
      => for-each($values)
      (: accumulator is [running total, value of the symbol to the right];
         a symbol smaller than its right neighbour is subtracted :)
      => fold-right([0,0], function($number,$accumulator) {
           if ($number lt $accumulator?2)
           then [ $accumulator?1 - $number, $number ]
           else [ $accumulator?1 + $number, $number ] } )
      => array:head()
};
|
||||||
|
|
||||||
|
(:~
 : Split a string into its individual characters, one string per codepoint.
 : Stand-in for the XPath 4 characters function.
 : @param $value string to split, may be empty
 : @return sequence of single character strings, empty for empty input
 :)
declare function pagenos:characters($value as xs:string?)
as xs:string*{
  for $codepoint in fn:string-to-codepoints($value)
  return fn:codepoints-to-string($codepoint)
};
|
@ -1,6 +1,6 @@
|
|||||||
xquery version '3.1';
|
xquery version '3.1';
|
||||||
(:~
|
(:~
|
||||||
pdfbox 3.0 https://pdfbox.apache.org/ interface library,
|
pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10+ interface library,
|
||||||
requires pdfbox jar on classpath
|
requires pdfbox jar on classpath
|
||||||
3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar
|
3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar
|
||||||
@see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader
|
@see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader
|
||||||
@ -52,8 +52,11 @@ as xs:string{
|
|||||||
PDDocument:save($doc,File:new($savepath)),$savepath
|
PDDocument:save($doc,File:new($savepath)),$savepath
|
||||||
};
|
};
|
||||||
|
|
||||||
declare function pdfbox:close($doc){
|
declare function pdfbox:close($doc)
|
||||||
PDDocument:close($doc)
|
as empty-sequence(){
|
||||||
|
(# db:wrapjava void #) {
|
||||||
|
PDDocument:close($doc)
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
declare function pdfbox:page-count($doc as item())
|
declare function pdfbox:page-count($doc as item())
|
||||||
@ -69,15 +72,15 @@ as map(*)*{
|
|||||||
let $bookmark:=
|
let $bookmark:=
|
||||||
PDDocument:getDocumentCatalog($doc)
|
PDDocument:getDocumentCatalog($doc)
|
||||||
=>PDDocumentCatalog:getDocumentOutline()
|
=>PDDocumentCatalog:getDocumentOutline()
|
||||||
=>PDOutlineItem:getFirstChild()=>trace("cur")
|
=>PDOutlineItem:getFirstChild()
|
||||||
|
|
||||||
let $bk:=pdfbox:outline($bookmark ,$doc)
|
let $bk:=pdfbox:outline($doc,$bookmark)
|
||||||
return $bk
|
return $bk
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
(: return bookmark info for children of $outlineItem :)
|
(: return bookmark info for children of $outlineItem :)
|
||||||
declare function pdfbox:outline($outlineItem,$doc )
|
declare function pdfbox:outline($doc,$outlineItem )
|
||||||
as map(*)*
|
as map(*)*
|
||||||
{
|
{
|
||||||
let $find:=hof:until(
|
let $find:=hof:until(
|
||||||
@ -85,7 +88,7 @@ as map(*)*
|
|||||||
function($input ) {
|
function($input ) {
|
||||||
let $bk:= pdfbox:bookmark($input?this,$doc)
|
let $bk:= pdfbox:bookmark($input?this,$doc)
|
||||||
let $bk:= if($bk?hasChildren)
|
let $bk:= if($bk?hasChildren)
|
||||||
then let $kids:=pdfbox:outline(PDOutlineItem:getFirstChild($input?this), $doc)
|
then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
|
||||||
return map:merge(($bk,map:entry("children",$kids)))
|
return map:merge(($bk,map:entry("children",$kids)))
|
||||||
else $bk
|
else $bk
|
||||||
return map{
|
return map{
|
||||||
@ -97,16 +100,22 @@ as map(*)*
|
|||||||
return $find?list
|
return $find?list
|
||||||
};
|
};
|
||||||
|
|
||||||
declare function pdfbox:outline-XML($outline as map(*)*)
|
declare function pdfbox:outline-xml($outline as map(*)*)
|
||||||
as element(*){
|
as element(outline){
|
||||||
element outline {
|
element outline {
|
||||||
for $bookmark in $outline
|
$outline!pdfbox:bookmark-xml(.)
|
||||||
return <bookmark title="{$bookmark?title}" index="{$bookmark?index}">
|
|
||||||
{$bookmark?children!pdfbox:outline-XML(.)}
|
|
||||||
</bookmark>
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
(:~
 : Convert bookmark maps to bookmark elements.
 : Each map becomes a bookmark element whose title and index attributes are
 : taken from the map entries of the same name; the entries under "children"
 : are converted recursively and nested inside.
 : @param $outline bookmark maps
 : @return one bookmark element per map
 :)
declare function pdfbox:bookmark-xml($outline as map(*)*)
as element(bookmark)*
{
  for $entry in $outline
  return
    <bookmark title="{ $entry?title }" index="{ $entry?index }">
      {
        for $child in $entry?children
        return pdfbox:bookmark-xml($child)
      }
    </bookmark>
};
|
||||||
|
|
||||||
(: return bookmark info for children of $outlineItem :)
|
(: return bookmark info for children of $outlineItem :)
|
||||||
declare function pdfbox:bookmark($bookmark as item(),$doc as item())
|
declare function pdfbox:bookmark($bookmark as item(),$doc as item())
|
||||||
as map(*){
|
as map(*){
|
||||||
@ -135,13 +144,14 @@ declare function pdfbox:pageIndex(
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
(:~ new PDF doc from 1 based page range :)
|
(:~ new PDF doc from 1 based page range
|
||||||
declare function pdfbox:extract($doc as item(),$target as xs:string,
|
@return save path :)
|
||||||
$start as xs:integer,$end as xs:integer)
|
declare function pdfbox:extract($doc as item(),
|
||||||
|
$start as xs:integer,$end as xs:integer,$target as xs:string)
|
||||||
|
as xs:string
|
||||||
{
|
{
|
||||||
let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract()
|
let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract()
|
||||||
let $map:=pdfbox:save($a,$target)
|
return (pdfbox:save($a,$target),pdfbox:close($a))
|
||||||
return pdfbox:close($a)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -154,9 +164,6 @@ as item()*{
|
|||||||
=>PDDocumentCatalog:getPageLabels()
|
=>PDDocumentCatalog:getPageLabels()
|
||||||
=>PDPageLabels:getLabelsByPageIndices()
|
=>PDPageLabels:getLabelsByPageIndices()
|
||||||
};
|
};
|
||||||
(:~ @TODO
|
|
||||||
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
|
|
||||||
:)
|
|
||||||
|
|
||||||
(: text on $pageNo :)
|
(: text on $pageNo :)
|
||||||
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
|
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
|
@ -13,10 +13,10 @@ declare variable $samples:= map{
|
|||||||
};
|
};
|
||||||
declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data";
|
declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data";
|
||||||
(:~ resolve :)
|
(:~ resolve :)
|
||||||
declare variable $PDF:= $samples?climate=>file:resolve-path($base);
|
declare variable $PDF:= $samples?women=>file:resolve-path($base);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
let $doc:=pdfbox:open($PDF)
|
let $doc:=pdfbox:open($PDF)
|
||||||
return pdfbox:outline($doc)=>pdfbox:outline-XML()
|
return pdfbox:outline($doc)=>pdfbox:outline-xml()
|
||||||
(: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :)
|
(: return pdfbox:extract($doc,"c:\tmp\junk3.pdf",1,pdfbox:page-count($doc)) :)
|
Loading…
Reference in New Issue
Block a user