From a59038453ac38ba4b117875549f888fc3a95e7ff Mon Sep 17 00:00:00 2001 From: Andy Bunce Date: Sat, 9 Mar 2024 22:43:29 +0000 Subject: [PATCH] [add] version --- .vscode/settings.json | 3 +++ docs/pdfbox.xqbk | 2 +- src/lib/pdfbox3.xqm | 34 ++++++++++++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..06abc03 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "basexTools.xquery.profile": "basex-10" +} \ No newline at end of file diff --git a/docs/pdfbox.xqbk b/docs/pdfbox.xqbk index 95f82c3..a0c7be7 100644 --- a/docs/pdfbox.xqbk +++ b/docs/pdfbox.xqbk @@ -1 +1 @@ -{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"}]} \ No newline at end of file +{"cells":[{"kind":1,"language":"markdown","value":"# pdfbox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"Comes with debug tool\r\n```\r\njava -jar debugger-app-3.0.1.jar\r\n```"},{"kind":1,"language":"markdown","value":"## Set up a XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox:3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};"},{"kind":1,"language":"markdown","value":"# Version in use"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"## page count"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-count($doc)"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"(:~ use full path :)\r\ndeclare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:extract($doc,2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= \r\n(: $samples?climate=>file:resolve-path($config:data); :)\r\n\"C:\\Users\\mrwhe\\Desktop\\1e\\set\\2-6-1\\A4512C_1\\257110---Book_File-Web_PDF_9798216013327_486681.pdf\";\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:outline($doc)=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?climate=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getPageLabels($doc)"},{"kind":1,"language":"markdown","value":"## page text"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"declare variable $PDF:= $samples?women=>file:resolve-path($config:data);\r\nlet $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:page-map($doc)"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=\"C:\\Users\\mrwhe\\Desktop\\1e\\\";\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\nlet $outline:=pdfbox:outline($doc)\r\nlet $count:=count($outline)\r\norder by $count \r\nreturn ``[`{$f}`: `{$count}`]``"}]} \ No newline at end of file diff --git a/src/lib/pdfbox3.xqm b/src/lib/pdfbox3.xqm index f33bcb8..b6e6274 100644 --- a/src/lib/pdfbox3.xqm +++ b/src/lib/pdfbox3.xqm @@ -1,9 +1,10 @@ xquery version '3.1'; (:~ -pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10+ interface library, +pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, requires pdfbox jar on classpath 3.02 required tested with pdfbox-app-3.0.2-20240121.184204-66.jar -@see https://lists.apache.org/list?users@pdfbox.apache.org:lte=1M:loader +@see https://repository.apache.org/content/groups/snapshots/org/apache/pdfbox/pdfbox-app/3.0.2-SNAPSHOT/ +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/ :) module namespace pdfbox="urn:expkg-zone58:pdfbox:3"; @@ -39,9 +40,14 @@ declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive :) declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; -declare namespace File ="java:java.io.File"; declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile"; +declare namespace File ="java:java.io.File"; +(:~ version of pdfbox:) +declare function pdfbox:version() +as xs:string{ + Q{java:org.apache.pdfbox.util.Version}getVersion() +}; (:~ open pdf, returns handle :) declare function pdfbox:open($pdfpath as xs:string){ Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) @@ -79,7 +85,7 @@ as map(*)*{ } }; -(: return bookmark info for children of $outlineItem :) +(: return bookmark info for children of $outlineItem as seq of maps :) declare function pdfbox:outline($doc,$outlineItem ) as map(*)* { @@ -118,15 +124,17 @@ as element(bookmark)* (: return bookmark info for children of $outlineItem :) declare function pdfbox:bookmark($bookmark as item(),$doc as item()) -as map(*){ +as map(*) +{ map{ "index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc), - "title": PDOutlineItem:getTitle($bookmark), + "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}, "hasChildren": PDOutlineItem:hasChildren($bookmark) } }; -declare function pdfbox:outx($page,$document){ +declare function pdfbox:outx($page,$document) +{ let $currentPage := PDOutlineItem:findDestinationPage($page,$document) let $pageNumber := pdfbox:pageIndex($currentPage,$document) return $pageNumber @@ -134,17 +142,19 @@ declare function pdfbox:outx($page,$document){ (:~ pageIndex of $page in $document :) declare function pdfbox:pageIndex( - $page (: as java:org.apache.pdfbox.pdmodel.PDPage :), + $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), $document) +as item()? { - PDDocument:getDocumentCatalog($document) - =>PDDocumentCatalog:getPages() - =>PDPageTree:indexOf($page) + if(exists($page)) + then PDDocument:getDocumentCatalog($document) + =>PDDocumentCatalog:getPages() + =>PDPageTree:indexOf($page) }; -(:~ new PDF doc from 1 based page range +(:~ save new PDF doc from 1 based page range @return save path :) declare function pdfbox:extract($doc as item(), $start as xs:integer,$end as xs:integer,$target as xs:string)