diff --git a/.gitea/workflows/ci-basex.yaml b/.gitea/workflows/ci-basex.yaml
index 75c6359..f66e985 100644
--- a/.gitea/workflows/ci-basex.yaml
+++ b/.gitea/workflows/ci-basex.yaml
@@ -10,17 +10,28 @@ on:
 jobs:
-  test:
-    runs-on: basex-10.7
+  test:
+    runs-on: ubuntu-latest
     steps:
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 18
-
       - name: Checkout repository
         uses: actions/checkout@v2
+      # setup-java v2 and later require an explicit `distribution` input.
+      - name: Set up Java
+        uses: actions/setup-java@v2
+        with:
+          distribution: 'temurin'
+          java-version: '11'
+
+      # The library header states BaseX 10.7+, so CI installs that release (over HTTPS).
+      # NOTE(review): the BaseX ZIP unpacks into its own top-level `basex/` directory,
+      # so it is extracted in place to leave the launcher at ./basex/bin/basex.
+      - name: Install BaseX
+        run: |
+          wget https://files.basex.org/releases/10.7/BaseX107.zip
+          unzip -q BaseX107.zip
+
       - name: Run BaseX Tests
-        run: 'basex/bin/basex -t .'
\ No newline at end of file
+        run: |
+          ./basex/bin/basex -c"RUN tests/test.bxs"
\ No newline at end of file
diff --git a/docs/pdfbox.xqbk b/docs/pdfbox.xqbk
index 9909d4d..d19e4fd 100644
--- a/docs/pdfbox.xqbk
+++ b/docs/pdfbox.xqbk
@@ -1 +1 @@
-{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. 
Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape' at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range 
to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ 
$count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]} \ No newline at end of file +{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape' at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": 
\"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\"; \r\n"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn 
pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]} \ No newline at end of file diff --git a/lib/pdfbox-3.0.3/fontbox-3.0.3.jar b/jars/lib/fontbox-3.0.3.jar similarity index 100% rename from lib/pdfbox-3.0.3/fontbox-3.0.3.jar rename to jars/lib/fontbox-3.0.3.jar diff --git a/lib/pdfbox-3.0.3/pdfbox-io-3.0.3.jar b/jars/lib/pdfbox-io-3.0.3.jar similarity index 100% rename from lib/pdfbox-3.0.3/pdfbox-io-3.0.3.jar rename to jars/lib/pdfbox-io-3.0.3.jar diff --git a/jars/loader.xqm b/jars/loader.xqm new file mode 100644 index 0000000..221d5a4 --- /dev/null +++ b/jars/loader.xqm @@ -0,0 +1,272 @@ +xquery version '3.1'; +(:~ +pdfbox 3.0 https://pdfbox.apache.org/ 
BaseX 10.7+ interface library, +requires pdfbox jar on classpath +3.02+ required tested with pdfbox-app-3.0.2.jar +@see download https://pdfbox.apache.org/download.cgi +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/ + +:) +module namespace pdfbox="org.apache.pdfbox.Loader"; + +declare namespace Loader ="java:org.apache.pdfbox.Loader"; +declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; + +(:~ +@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDDocument.html +:) +declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; + +declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; +declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; + +(:~ +@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/multipdf/PageExtractor.html +:) +declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; + +(:~ + @see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDPageTree.html +:) +declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; + +(:~ +@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html +:) +declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; + +declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; +(:~ +@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html +:) +declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; +declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; +declare namespace 
RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
+declare namespace File ="java:java.io.File";
+
+(:~
+ Version of the PDFBox library found on the classpath.
+ @return value of org.apache.pdfbox.util.Version.getVersion()
+:)
+declare function pdfbox:version()
+as xs:string{
+  Q{java:org.apache.pdfbox.util.Version}getVersion()
+};
+
+(:~
+ Open the PDF at $pdfpath.
+ @param $pdfpath file system path of the PDF
+ @return Java handle returned by Loader.loadPDF; it is the $doc argument of the
+         other functions in this module and is released with pdfbox:close
+:)
+declare function pdfbox:open($pdfpath as xs:string)
+as item(){
+  Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath))
+};
+
+(:~
+ The PDF specification version this document conforms to.
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:pdfVersion($doc as item())
+as xs:float{
+  PDDocument:getVersion($doc)
+};
+
+(:~
+ Save pdf $doc to $savepath.
+ @param $doc handle from pdfbox:open
+ @param $savepath target file path
+ @return $savepath
+:)
+declare function pdfbox:save($doc as item(),$savepath as xs:string)
+as xs:string{
+  PDDocument:save($doc,File:new($savepath)),$savepath
+};
+
+(:~
+ Close $doc. The Java call is wrapped with the `db:wrapjava void` pragma so that
+ nothing is returned.
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:close($doc as item())
+as empty-sequence(){
+  (# db:wrapjava void #) {
+    PDDocument:close($doc)
+  }
+};
+
+(:~
+ Number of pages in $doc.
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:page-count($doc as item())
+as xs:integer{
+  PDDocument:getNumberOfPages($doc)
+};
+
+(:~
+ Map with document metadata read from the document information object.
+ Keys: title, creator, producer, subject, keywords, creationdate, author.
+ "creationdate" is the empty sequence when the PDF carries no creation date.
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:information($doc as item())
+as map(*){
+  let $info:=PDDocument:getDocumentInformation($doc)
+  return map{
+    "title": PDDocumentInformation:getTitle($info),
+    "creator": PDDocumentInformation:getCreator($info),
+    "producer": PDDocumentInformation:getProducer($info),
+    "subject": PDDocumentInformation:getSubject($info),
+    "keywords": PDDocumentInformation:getKeywords($info),
+    "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
+    "author": PDDocumentInformation:getAuthor($info)
+  }
+};
+
+(:~
+ Convert a java.util.GregorianCalendar to the string form of its ZonedDateTime.
+ @param $item calendar object, or empty when the Java getter returned null
+ @return date string, or empty when $item is empty
+:)
+declare
+function pdfbox:gregToISO($item as item()?)
+as xs:string?{
+  (: a missing date must not raise a type error in pdfbox:information :)
+  if(exists($item))
+  then Q{java:java.util.GregorianCalendar}toZonedDateTime($item)=>string()
+  else ()
+};
+
+(:~
+ Outline (bookmarks) of $doc as a sequence of maps, one per top level bookmark.
+ Empty when the document catalog has no outline.
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:outline($doc as item())
+as map(*)*{
+  (# db:wrapjava some #) {
+  let $outline:=
+    PDDocument:getDocumentCatalog($doc)
+
=>PDDocumentCatalog:getDocumentOutline()
+
+  return if(exists($outline))
+         then pdfbox:outline($doc,PDOutlineItem:getFirstChild($outline))
+  }
+};
+
+(:~
+ Bookmark info for $outlineItem and its following siblings as a sequence of maps.
+ @param $doc handle from pdfbox:open
+ @param $outlineItem first outline item to report, may be empty
+:)
+declare function pdfbox:outline($doc as item(),$outlineItem as item()?)
+
+as map(*)*{
+  let $find as map(*):=pdfbox:_outline($doc ,$outlineItem)
+  return map:get($find,"list")
+};
+
+(: BaseX bug 10.7? error if inlined in outline :)
+(:~
+ Walk the sibling chain starting at $outlineItem.
+ The state map holds "list" (bookmark maps collected so far) and "this" (the item
+ still to be processed); iteration stops when "this" is empty.
+ @return the final state map; the result is under the "list" key
+:)
+declare function pdfbox:_outline($doc as item(),$outlineItem as item()?)
+as map(*){
+  hof:until(
+    function($output) { empty($output?this) },
+    function($input ) {
+      let $bk:= pdfbox:bookmark($input?this,$doc)
+      (: descend first: child bookmarks are attached under the "children" key :)
+      let $bk:= if($bk?hasChildren)
+                then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
+                     return map:merge(($bk,map:entry("children",$kids)))
+                else $bk
+      return map{
+        "list": ($input?list, $bk),
+        "this": PDOutlineItem:getNextSibling($input?this)}
+    },
+    map{"list":(),"this":$outlineItem}
+  )
+};
+(:~
+ Outline as xml: an outline element containing one bookmark element per map.
+ @param $outline maps as returned by pdfbox:outline
+:)
+declare function pdfbox:outline-xml($outline as map(*)*)
+as element(outline){
+  element outline {
+    $outline!pdfbox:bookmark-xml(.)
+  }
+};
+
+(:~
+ Bookmark maps as nested bookmark elements; maps under "children" recurse.
+ NOTE(review): the element constructor was missing from this function body; the
+ index and title attributes are reconstructed from the keys built by
+ pdfbox:bookmark - confirm the attribute set against the intended output.
+ @param $outline maps as returned by pdfbox:outline
+:)
+declare function pdfbox:bookmark-xml($outline as map(*)*)
+as element(bookmark)*
+{
+  $outline!
+  element bookmark {
+    attribute index { ?index },
+    attribute title { ?title },
+    ?children!pdfbox:bookmark-xml(.)
+  }
+};
+
+(:~
+ Map describing a single outline item.
+ Keys: "index" (page index of the destination page, empty when there is none),
+ "title" and "hasChildren".
+ @param $bookmark outline item
+ @param $doc handle from pdfbox:open
+:)
+declare function pdfbox:bookmark($bookmark as item(),$doc as item())
+as map(*)
+{
+  map{
+    "index": PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc),
+    (: NOTE(review): the translate call strips one character from the title,
+       presumably the Unicode replacement character - confirm the literal :)
+    "title": (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("�",""),
+    "hasChildren": PDOutlineItem:hasChildren($bookmark)
+  }
+};
+
+(:~
+ Page index of the destination page of $page.
+ NOTE(review): $page is passed to PDOutlineItem:findDestinationPage, so it is an
+ outline item rather than a page - confirm against callers.
+:)
+declare function pdfbox:outx($page ,$document)
+{
+  let $currentPage := PDOutlineItem:findDestinationPage($page,$document)
+  let $pageNumber := pdfbox:pageIndex($currentPage,$document)
+  return $pageNumber
+};
+
+(:~
+ pageIndex of $page in $document, empty when $page is empty
+:)
+declare function pdfbox:pageIndex(
+  $page as item()? 
(: as java:org.apache.pdfbox.pdmodel.PDPage :),
+  $document)
+as item()?
+{
+  if(exists($page))
+  then PDDocument:getDocumentCatalog($document)
+       =>PDDocumentCatalog:getPages()
+       =>PDPageTree:indexOf($page)
+};
+
+
+
+(:~ save new PDF doc from 1 based page range
+@param $doc handle from pdfbox:open
+@param $start first page of the range
+@param $end last page of the range
+@param $target path the extracted document is saved to
+@return save path :)
+declare function pdfbox:extract($doc as item(),
+  $start as xs:integer,$end as xs:integer,$target as xs:string)
+as xs:string
+{
+  let $a:=PageExtractor:new($doc, $start, $end) =>PageExtractor:extract()
+  (: the extracted document is a separate handle and is closed once saved :)
+  return (pdfbox:save($a,$target),pdfbox:close($a))
+};
+
+
+(:~ pageLabel info
+@see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
+@see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
+@return Java page labels object, or empty when the catalog has no page labels
+:)
+declare function pdfbox:getPageLabels($doc as item())
+as item()?
+{
+  PDDocument:getDocumentCatalog($doc)
+  =>PDDocumentCatalog:getPageLabels()
+};
+
+(:~ pageLabel for every page
+@return labels by page index, or empty when the catalog has no page labels
+:)
+declare function pdfbox:pageLabels($doc as item())
+as xs:string*
+{
+  let $labels:=PDDocument:getDocumentCatalog($doc)
+               =>PDDocumentCatalog:getPageLabels()
+  (: a null result from the catalog must not be passed on to the Java call :)
+  return if(exists($labels))
+         then PDPageLabels:getLabelsByPageIndices($labels)
+         else ()
+};
+
+(:~ return text on $pageNo
+NOTE(review): $pageNo is used as both start and end page of PDFTextStripper,
+whose page numbers are documented as 1 based, unlike pdfbox:pageBufferedImage
+which is documented as zero based - confirm.
+:)
+declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
+as xs:string{
+  let $tStripper := (# db:wrapjava instance #) {
+      PDFTextStripper:new()
+      => PDFTextStripper:setStartPage($pageNo)
+      => PDFTextStripper:setEndPage($pageNo)
+  }
+  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
+};
+
+(:~ summary info as map for $pdfpath
+Merges "file", "pages" and "outline" (count of top level bookmarks) with the
+entries of pdfbox:information. The document is opened and closed here.
+:)
+declare function pdfbox:report($pdfpath as xs:string)
+as map(*){
+  let $doc:=pdfbox:open($pdfpath)
+  let $summary:=(map{
+    "file": $pdfpath,
+    "pages": pdfbox:page-count($doc),
+    "outline": pdfbox:outline($doc)=>count()
+    },pdfbox:information($doc)
+  )=>map:merge()
+  (: same pattern as pdfbox:extract: result first, then release the handle :)
+  return ($summary,pdfbox:close($doc))
+};
+
+(:~ java:bufferedImage for $pageNo using $scale times dpi= 72
+@param $pageNo (ZERO based)
+@param $scale 1=72 dpi
+@return Java java.awt.image.BufferedImage object
+:)
+declare function 
pdfbox:pageBufferedImage($doc as item(), $pageNo as xs:integer,$scale as xs:float)
+as item(){
+  (: a new PDFRenderer is created for $doc on every call :)
+  PDFRenderer:new($doc)=>PDFRenderer:renderImage($pageNo,$scale)
+};
+
+(:~ save bufferedimage to $dest
+@param $bufferedImage image as returned by pdfbox:pageBufferedImage
+@param $dest path of the file to write
+@param $type = "gif","png" etc
+@return the boolean returned by ImageIO.write (per the ImageIO javadoc, false
+        means no suitable writer was found for $type):)
+declare function pdfbox:imageSave($bufferedImage as item(),$dest as xs:string,$type as xs:string)
+as xs:boolean{
+  Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, File:new($dest))
+};
+
+(:~ return image
+@param $bufferedImage image as returned by pdfbox:pageBufferedImage
+@param $type = "gif","png" etc
+@return the encoded image bytes as base64Binary:)
+declare function pdfbox:imageBinary($bufferedImage as item(),$type as xs:string)
+as xs:base64Binary{
+  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
+  (: the image is written into the in-memory stream; the result of write is not checked :)
+  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type, $bytes)
+  return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
+         =>convert:integers-to-base64()
+};
\ No newline at end of file
diff --git a/lib/pdfbox-3.0.3/pdfbox-3.0.3.jar b/jars/pdfbox-3.0.3.jar
similarity index 100%
rename from lib/pdfbox-3.0.3/pdfbox-3.0.3.jar
rename to jars/pdfbox-3.0.3.jar
diff --git a/lib/pdfbox-3.0.3.fat.jar b/lib/pdfbox-3.0.3.fat.jar
new file mode 100644
index 0000000..f535e5b
Binary files /dev/null and b/lib/pdfbox-3.0.3.fat.jar differ
diff --git a/scripts/build.xqm b/scripts/build.xqm
new file mode 100644
index 0000000..9cfa38b
--- /dev/null
+++ b/scripts/build.xqm
@@ -0,0 +1,51 @@
+(:~ build utils for REPO packaging :)
+module namespace build = 'urn:quodatum:build1';
+
+(:~ create a flat fat jar
+Merges the entries of every *.jar directly inside $input-dir into one zip.
+Entries under META-INF/ are kept only from $manifest-jar.
+NOTE(review): entry names are concatenated as they come, without de-duplication.
+@param $input-dir folder holding the jars; it is concatenated with the jar name,
+       so it must end with a directory separator
+@param $manifest-jar name of the jar whose META-INF/ entries are retained
+:)
+declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
+as xs:base64Binary {
+  let $fold :=
+function ($res as map (*), $jar as xs:string) {
+  let $bin :=file:read-binary($input-dir || $jar),
+      $paths := archive:entries($bin)/string()
+                [$jar eq $manifest-jar or not(starts-with( .,"META-INF/"))]
+  return
+    map { "name" : ($res? name, $paths),
+          "content" : ($res? 
content,archive:extract-binary($bin, $paths)) }
+}
+let $res := fold-left(file:list($input-dir, false(), "*.jar"), map { }, $fold)
+return
+  archive:create($res? name, $res? content,
+    map { "format" : "zip", "algorithm" : "deflate" })
+};
+
+(:~ create a fat jar with lib
+The result holds every entry of $manifest-jar plus each *.jar found directly in
+the lib/ folder below $input-dir, stored unexpanded under lib/.
+@param $input-dir folder holding $manifest-jar and lib/; it is concatenated with
+       file names, so it must end with a directory separator
+@param $manifest-jar name of the main jar inside $input-dir
+@return the new archive (zip, deflate)
+:)
+declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as xs:string)
+as xs:base64Binary
+ {
+  let $bin :=file:read-binary($input-dir || $manifest-jar)
+
+  let $lib:=file:list($input-dir || "lib/", false(), "*.jar")!concat("lib/",.)
+  let $entries:=archive:entries($bin)/string()
+  let $name:= ($entries
+              ,$lib)
+  (: only the manifest jar's own entries are requested from it; the lib jars are
+     read from disk, in the same order as their names above :)
+  let $content:=(archive:extract-binary($bin,$entries)
+                ,$lib!file:read-binary($input-dir || .))
+return archive:create($name, $content,
+  map { "format" : "zip", "algorithm" : "deflate" })
+};
+
+(:~ update-manifest
+Replace META-INF/MANIFEST.MF in $jar with a two line manifest: Manifest-Version
+and Main-Class. Each header sits on its own line and the file ends with a line
+break, as the JAR manifest format requires.
+@param $jar archive to update
+@param $main-class value of the Main-Class header
+@return the updated archive
+:)
+declare function build:update-manifest($jar as xs:base64Binary,$main-class as xs:string)
+as xs:base64Binary{
+(: let $mf:=archive:extract-text($jar,"META-INF/MANIFEST.MF") :)
+
+(: line feed built from its codepoint so it cannot be lost in transcription :)
+let $nl:=codepoints-to-string(10)
+let $mf2:=concat("Manifest-Version: 1.0", $nl,
+  "Main-Class: ", $main-class, $nl)
+return archive:update($jar,"META-INF/MANIFEST.MF",$mf2)
+};
+
+(:~ update a single archive entry
+@param $jar archive to update
+@param $name entry path inside the archive
+@param $file new content of the entry, passed on unchanged to archive:update
+@return the updated archive
+:)
+declare function build:update($jar as xs:base64Binary,$name as xs:string,$file as xs:string)
+as xs:base64Binary{
+archive:update($jar,$name,$file)
+};
diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq
new file mode 100644
index 0000000..fe4cfbf
--- /dev/null
+++ b/scripts/make-fat-jar.xq
@@ -0,0 +1,27 @@
+
+import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
+
+(: Main execution: build the fat jar from the manifest jar plus its lib/ jars,
+   rewrite the manifest with the configured Main-Class, add loader.xqm under the
+   path derived from that class name and write the result to "output".
+   NOTE(review): "input-dir" is an absolute, machine-specific path.
+ :)
+let $config :=map {
+  "manifest-jar" : "pdfbox-3.0.3.jar",
+  "input-dir" : "C:\Users\mrwhe\git\expkg-zone58\pdfbox\jars\",
+  "output" : "../lib/pdfbox-3.0.3.fat.jar",
+  "main-class": "org.apache.pdfbox.Loader"
+  }
+
+let $fat-jar := build:fatjar-with-lib($config?input-dir,$config?manifest-jar)
+
+let $fat-jar:=build:update-manifest($fat-jar, $config?main-class)
+(: dots in the class name become path separators, giving <package path>/<class>.xqm :)
+let $name:=replace($config?main-class,"\.","/") || ".xqm"
+let $content:=file:read-binary($config?input-dir || "loader.xqm")
+let $fat-jar:=archive:update($fat-jar, $name,$content)
+(: "output" is resolved against "input-dir" :)
+let $output-file := file:resolve-path($config?output, $config?input-dir)
+return (file:write-binary($output-file, $fat-jar),
+  trace($output-file,"fat jar: "))
+
\ No newline at end of file
diff --git a/src/test/test.xqm b/src/test/test.xqm
index f7a57a1..2426c8e 100644
--- a/src/test/test.xqm
+++ b/src/test/test.xqm
@@ -2,7 +2,7 @@
 :)
 module namespace test="urn:expkg-zone58:pdfbox3:tests";
-import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm";
+import module namespace pdfbox="org.apache.pdfbox.Loader";
 declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();