[mod] sample
This commit is contained in:
parent
a59038453a
commit
0659567f36
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
abc/
|
@ -1,2 +1,4 @@
|
||||
# Pdfbox
|
||||
A BaseX (10+) interface to the [Apache PDFBox](https://pdfbox.apache.org/) library, version 3.
|
||||
|
||||
|
||||
|
@ -1 +1 @@
|
||||
{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"# Version in use"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new 
pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= \r\n(: $samples?climate=>file:resolve-path($config:data); :)\r\n\"C:\\Users\\mrwhe\\Desktop\\1e\\set\\2-6-1\\A4512C_1\\257110---Book_File-Web_PDF_9798216013327_486681.pdf\";\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=\"C:\\Users\\mrwhe\\Desktop\\1e\\\";\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\nlet $outline:=pdfbox:outline($doc)\r\nlet $count:=count($outline)\r\norder by $count \r\nreturn ``[`{$f}`: 
`{$count}`]``"}]}
|
||||
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.1.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace pagenos = 'urn:pageno' at \"../src/lib/pageno.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":"# Version of pdfbox in 
use"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)=>pagenos:inverted-map()"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=\"C:\\Users\\mrwhe\\Desktop\\1e\\\";\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\nlet $outline:=pdfbox:outline($doc)\r\nlet $count:=count($outline)\r\norder by $count \r\nreturn ``[`{$f}`: `{$count}`]``"}]}
|
BIN
samples.pdf/BaseX100.pdf
Normal file
BIN
samples.pdf/BaseX100.pdf
Normal file
Binary file not shown.
5
src/lib/abc.xqm
Normal file
5
src/lib/abc.xqm
Normal file
@ -0,0 +1,5 @@
|
||||
xquery version '3.1';
|
||||
(:~ Look for page numbers in PDF page text. Typical usage:
|
||||
pagenos:page-report($doc )=>pagenos:inverted-map()
|
||||
:)
|
||||
module namespace pagenos = 'urn:pageno';
|
@ -13,8 +13,8 @@ import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm";
|
||||
declare variable $pagenos:pats:=map{
|
||||
"DL": "^([1-9][0-9]*).*",
|
||||
"DR": ".*[^0-9]([1-9][0-9]*)$",
|
||||
"RL": "^([ivxc]+).*",
|
||||
"RR": ".*[^ivxc]([ivxc]+)$"
|
||||
"RL": "^([ivxlc]+).*",
|
||||
"RR": ".*[^ivxlc]([ivxlc]+)$"
|
||||
};
|
||||
|
||||
(: page-reports for all pages :)
|
||||
|
@ -5,6 +5,7 @@ requires pdfbox jar on classpath
|
||||
Requires PDFBox 3.0.2; tested with pdfbox-app-3.0.2-20240121.184204-66.jar
|
||||
@see https://repository.apache.org/content/groups/snapshots/org/apache/pdfbox/pdfbox-app/3.0.2-SNAPSHOT/
|
||||
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/
|
||||
|
||||
:)
|
||||
module namespace pdfbox="urn:expkg-zone58:pdfbox:3";
|
||||
|
||||
@ -48,17 +49,26 @@ declare function pdfbox:version()
|
||||
as xs:string{
|
||||
Q{java:org.apache.pdfbox.util.Version}getVersion()
|
||||
};
|
||||
|
||||
(:~ Open the PDF file at $pdfpath and return a handle to the loaded document. :)
|
||||
declare function pdfbox:open($pdfpath as xs:string){
|
||||
declare function pdfbox:open($pdfpath as xs:string)
|
||||
as item(){
|
||||
Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
|
||||
};
|
||||
|
||||
(:~ Return the PDF specification version that the document $doc conforms to. :)
|
||||
declare function pdfbox:pdfVersion($doc as item())
|
||||
as xs:float{
|
||||
PDDocument:getVersion($doc)
|
||||
};
|
||||
|
||||
(:~ Save the PDF document $doc to $savepath and return $savepath. :)
|
||||
declare function pdfbox:save($doc,$savepath as xs:string)
|
||||
declare function pdfbox:save($doc as item(),$savepath as xs:string)
|
||||
as xs:string{
|
||||
PDDocument:save($doc,File:new($savepath)),$savepath
|
||||
};
|
||||
|
||||
declare function pdfbox:close($doc)
|
||||
declare function pdfbox:close($doc as item())
|
||||
as empty-sequence(){
|
||||
(# db:wrapjava void #) {
|
||||
PDDocument:close($doc)
|
||||
@ -103,7 +113,7 @@ as map(*)*
|
||||
},
|
||||
map{"list":(),"this":$outlineItem}
|
||||
)
|
||||
return $find?list
|
||||
return $find?list
|
||||
};
|
||||
|
||||
declare function pdfbox:outline-xml($outline as map(*)*)
|
||||
@ -128,7 +138,7 @@ as map(*)
|
||||
{
|
||||
map{
|
||||
"index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc),
|
||||
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)},
|
||||
"title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("<22>",""),
|
||||
"hasChildren": PDOutlineItem:hasChildren($bookmark)
|
||||
}
|
||||
};
|
||||
@ -165,11 +175,13 @@ as xs:string
|
||||
};
|
||||
|
||||
|
||||
(:~ @TODO
|
||||
(:~ Return the page label of every page in $doc, by page index.
|
||||
@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
|
||||
@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
|
||||
:)
|
||||
declare function pdfbox:getPageLabels($doc as item())
|
||||
as item()*{
|
||||
as xs:string*
|
||||
{
|
||||
PDDocument:getDocumentCatalog($doc)
|
||||
=>PDDocumentCatalog:getPageLabels()
|
||||
=>PDPageLabels:getLabelsByPageIndices()
|
||||
|
Loading…
Reference in New Issue
Block a user