diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..84bfa3f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +abc/ \ No newline at end of file diff --git a/README.md b/README.md index 26dcf89..8569f9f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # Pdfbox BaseX (10+) interface to [Pdfbox](https://pdfbox.apache.org/) 3 + + diff --git a/docs/pdfbox.xqbk b/docs/pdfbox.xqbk index a0c7be7..92162e3 100644 --- a/docs/pdfbox.xqbk +++ b/docs/pdfbox.xqbk @@ -1 +1 @@ -{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"# Version in use"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= \r\n(: $samples?climate=>file:resolve-path($config:data); :)\r\n\"C:\\Users\\mrwhe\\Desktop\\1e\\set\\2-6-1\\A4512C_1\\257110---Book_File-Web_PDF_9798216013327_486681.pdf\";\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=\"C:\\Users\\mrwhe\\Desktop\\1e\\\";\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\nlet $outline:=pdfbox:outline($doc)\r\nlet $count:=count($outline)\r\norder by $count \r\nreturn ``[`{$f}`: `{$count}`]``"}]} \ No newline at end of file +{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.1.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace pagenos = 'urn:pageno' at \"../src/lib/pageno.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":"# Version of pdfbox in use"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)=>pagenos:inverted-map()"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=\"C:\\Users\\mrwhe\\Desktop\\1e\\\";\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\nlet $outline:=pdfbox:outline($doc)\r\nlet $count:=count($outline)\r\norder by $count \r\nreturn ``[`{$f}`: `{$count}`]``"}]} \ No newline at end of file diff --git a/samples.pdf/BaseX100.pdf b/samples.pdf/BaseX100.pdf new file mode 100644 index 0000000..4c6c05f Binary files /dev/null and b/samples.pdf/BaseX100.pdf differ diff --git a/src/lib/abc.xqm b/src/lib/abc.xqm new file mode 100644 index 0000000..a7fa812 --- /dev/null +++ b/src/lib/abc.xqm @@ -0,0 +1,5 @@ +xquery version '3.1'; +(:~ look for pagenos in pdf text +pagenos:page-report($doc )=>pagenos:inverted-map() +:) +module namespace pagenos = 'urn:pageno'; \ No newline at end of file diff --git a/src/lib/pageno.xqm b/src/lib/pageno.xqm index 7626b3f..2f9a65d 100644 --- a/src/lib/pageno.xqm +++ b/src/lib/pageno.xqm @@ -13,8 +13,8 @@ import module namespace pdfbox="urn:expkg-zone58:pdfbox:3" at "pdfbox3.xqm"; declare variable $pagenos:pats:=map{ "DL": "^([1-9][0-9]*).*", "DR": ".*[^0-9]([1-9][0-9]*)$", - "RL": "^([ivxc]+).*", - "RR": ".*[^ivxc]([ivxc]+)$" + "RL": "^([ivxlc]+).*", + "RR": ".*[^ivxlc]([ivxlc]+)$" }; (: page-reports for all pages :) diff --git a/src/lib/pdfbox3.xqm b/src/lib/pdfbox3.xqm index b6e6274..3c1c940 100644 --- a/src/lib/pdfbox3.xqm +++ b/src/lib/pdfbox3.xqm @@ -5,6 +5,7 @@ requires pdfbox jar on classpath 3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar @see https://repository.apache.org/content/groups/snapshots/org/apache/pdfbox/pdfbox-app/3.0.2-SNAPSHOT/ @javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/ + :) module namespace pdfbox="urn:expkg-zone58:pdfbox:3"; @@ -48,17 +49,26 @@ declare function pdfbox:version() as xs:string{ Q{java:org.apache.pdfbox.util.Version}getVersion() }; + (:~ open pdf, returns handle :) -declare function pdfbox:open($pdfpath as xs:string){ +declare function pdfbox:open($pdfpath as xs:string) +as item(){ Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) }; + +(:~ the PDF specification version this document conforms to.:) +declare function pdfbox:pdfVersion($doc as item()) +as xs:float{ + PDDocument:getVersion($doc) +}; + (:~ save pdf $doc to $savepath , returns $savepath :) -declare function pdfbox:save($doc,$savepath as xs:string) +declare function pdfbox:save($doc as item(),$savepath as xs:string) as xs:string{ PDDocument:save($doc,File:new($savepath)),$savepath }; -declare function pdfbox:close($doc) +declare function pdfbox:close($doc as item()) as empty-sequence(){ (# db:wrapjava void #) { PDDocument:close($doc) @@ -103,7 +113,7 @@ as map(*)* }, map{"list":(),"this":$outlineItem} ) - return $find?list + return $find?list }; declare function pdfbox:outline-xml($outline as map(*)*) @@ -128,7 +138,7 @@ as map(*) { map{ "index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc), - "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}, + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("�",""), "hasChildren": PDOutlineItem:hasChildren($bookmark) } }; @@ -165,11 +175,13 @@ as xs:string }; -(:~ @TODO +(:~ pageLabel for every page +@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files :) declare function pdfbox:getPageLabels($doc as item()) -as item()*{ +as xs:string* +{ PDDocument:getDocumentCatalog($doc) =>PDDocumentCatalog:getPageLabels() =>PDPageLabels:getLabelsByPageIndices()