[fix] combined packaging
This commit is contained in:
		
							parent
							
								
									4bcfaefcc0
								
							
						
					
					
						commit
						1fe7edc1f7
					
				
					 10 changed files with 362 additions and 10 deletions
				
			
		| 
						 | 
				
			
			@ -10,17 +10,23 @@ on:
 | 
			
		|||
 | 
			
		||||
  
 | 
			
		||||
jobs:
 | 
			
		||||
  test:
 | 
			
		||||
    runs-on:  basex-10.7
 | 
			
		||||
 test:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
 | 
			
		||||
    steps:
 | 
			
		||||
    - name: Set up Node.js
 | 
			
		||||
      uses: actions/setup-node@v4
 | 
			
		||||
      with:
 | 
			
		||||
       node-version: 18
 | 
			
		||||
 | 
			
		||||
    - name: Checkout repository
 | 
			
		||||
      uses: actions/checkout@v2
 | 
			
		||||
 | 
			
		||||
    - name: Set up Java
 | 
			
		||||
      uses: actions/setup-java@v2
 | 
			
		||||
      with:
 | 
			
		||||
        java-version: '11'
 | 
			
		||||
 | 
			
		||||
    - name: Install BaseX
 | 
			
		||||
      run: |
 | 
			
		||||
        wget http://files.basex.org/releases/9.6.3/BaseX963.zip
 | 
			
		||||
        unzip BaseX963.zip -d basex
 | 
			
		||||
 | 
			
		||||
    - name: Run BaseX Tests
 | 
			
		||||
      run:  'basex/bin/basex -t .'
 | 
			
		||||
      run: |
 | 
			
		||||
        ./basex/bin/basex -c"RUN tests/test.bxs"
 | 
			
		||||
| 
						 | 
				
			
			@ -1 +1 @@
 | 
			
		|||
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages'  at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape'  at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n    \"climate\":  \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n    \"women\":    \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n    \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n    \"world\":    \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) 
:)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"#  getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save 
images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]}
 | 
			
		||||
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.2.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages'  at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape'  at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n    \"climate\":  \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n    \"women\":    \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n    \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n    \"world\":    \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\"; 
\r\n"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"#  getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageBufferedImage(99,1)\r\n=>pdfbox:imageSave(\"c:\\tmp\\page3.png\",\"png\")\r\n"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> 
pdfbox:pageBufferedImage(3,0.25)\r\n=>pdfbox:imageBinary(\"jpg\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f at $pos in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nreturn pdfbox:open(file:resolve-path($f,$a))\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(``[c:\\tmp\\titles\\p`{$pos}`.gif]``,\"gif\")"}]}
 | 
			
		||||
							
								
								
									
										272
									
								
								jars/loader.xqm
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										272
									
								
								jars/loader.xqm
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,272 @@
 | 
			
		|||
xquery version '3.1';
 | 
			
		||||
(:~ 
 | 
			
		||||
pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, 
 | 
			
		||||
requires pdfbox jar on classpath
 | 
			
		||||
PDFBox 3.0.2+ is required; tested with pdfbox-app-3.0.2.jar
 | 
			
		||||
@see download https://pdfbox.apache.org/download.cgi
 | 
			
		||||
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/
 | 
			
		||||
 | 
			
		||||
:)
 | 
			
		||||
module namespace pdfbox="org.apache.pdfbox.Loader";
 | 
			
		||||
 | 
			
		||||
declare namespace Loader ="java:org.apache.pdfbox.Loader"; 
 | 
			
		||||
declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper";
 | 
			
		||||
 | 
			
		||||
(:~ 
 | 
			
		||||
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDDocument.html 
 | 
			
		||||
:)
 | 
			
		||||
declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument";
 | 
			
		||||
 | 
			
		||||
declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog";
 | 
			
		||||
declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels";
 | 
			
		||||
 | 
			
		||||
(:~ 
 | 
			
		||||
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/multipdf/PageExtractor.html 
 | 
			
		||||
:)
 | 
			
		||||
declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor";
 | 
			
		||||
 
 | 
			
		||||
(:~ 
 | 
			
		||||
 @see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDPageTree.html
 | 
			
		||||
:)
 | 
			
		||||
declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree";
 | 
			
		||||
 | 
			
		||||
(:~ 
 | 
			
		||||
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html 
 | 
			
		||||
:)
 | 
			
		||||
declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline";
 | 
			
		||||
 | 
			
		||||
declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation";
 | 
			
		||||
(:~ 
 | 
			
		||||
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html 
 | 
			
		||||
:)
 | 
			
		||||
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
 | 
			
		||||
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
 | 
			
		||||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
 | 
			
		||||
declare namespace File ="java:java.io.File";
 | 
			
		||||
 | 
			
		||||
(:~
 Version of the PDFBox library that is currently loaded.
 @return the string reported by org.apache.pdfbox.util.Version.getVersion()
:)
declare function pdfbox:version()
as xs:string {
  let $version := Q{java:org.apache.pdfbox.util.Version}getVersion()
  return $version
};
 | 
			
		||||
 | 
			
		||||
(:~
 Open the PDF stored at $pdfpath.
 @param $pdfpath file system path of the PDF to load
 @return handle (Java object) for the loaded document, to be passed as $doc
         to the other functions of this module
:)
declare function pdfbox:open($pdfpath as xs:string)
as item() {
  let $source := RandomAccessReadBufferedFile:new($pdfpath)
  return Loader:loadPDF($source)
};
 | 
			
		||||
 | 
			
		||||
(:~
 PDF specification version that the document declares conformance to.
 @param $doc document handle, e.g. from pdfbox:open
 @return version number as a float
:)
declare function pdfbox:pdfVersion($doc as item())
as xs:float {
  $doc => PDDocument:getVersion()
};
 | 
			
		||||
 | 
			
		||||
(:~
 Write the document $doc to the file $savepath.
 @param $doc document handle
 @param $savepath destination file path
 @return $savepath, so that the call can be chained
:)
declare function pdfbox:save($doc as item(),$savepath as xs:string)
as xs:string {
  let $target := File:new($savepath)
  return (PDDocument:save($doc, $target), $savepath)
};
 | 
			
		||||
 | 
			
		||||
(:~
 Close the document $doc.
 The call is evaluated under the "db:wrapjava void" pragma and the function
 returns the empty sequence.
 @param $doc document handle
:)
declare function pdfbox:close($doc as item())
as empty-sequence() {
  (# db:wrapjava void #) {
    $doc => PDDocument:close()
  }
};
 | 
			
		||||
 | 
			
		||||
(:~
 Number of pages in the document.
 @param $doc document handle
 @return page count
:)
declare function pdfbox:page-count($doc as item())
as xs:integer {
  $doc => PDDocument:getNumberOfPages()
};
 | 
			
		||||
 | 
			
		||||
(:~
 Map with document metadata taken from the document information dictionary.
 Keys: "title", "creator", "producer", "subject", "keywords", "creationdate"
 and "author". A key whose underlying value is missing maps to the empty
 sequence.
 @param $doc document handle
 @return metadata map
:)
declare function pdfbox:information($doc as item())
as map(*){
  let $info := PDDocument:getDocumentInformation($doc)
  return map{
    "title": PDDocumentInformation:getTitle($info),
    "creator": PDDocumentInformation:getCreator($info),
    "producer": PDDocumentInformation:getProducer($info),
    "subject": PDDocumentInformation:getSubject($info),
    "keywords": PDDocumentInformation:getKeywords($info),
    (: a PDF without a creation date yields the empty sequence here; mapping
       with "!" skips the conversion instead of failing the item() parameter
       of pdfbox:gregToISO :)
    "creationdate": PDDocumentInformation:getCreationDate($info) ! pdfbox:gregToISO(.),
    "author": PDDocumentInformation:getAuthor($info)
  }
};
 | 
			
		||||
 | 
			
		||||
 (:~
 Convert a Java calendar value to a string.
 @param $item Java object accepted by java.util.GregorianCalendar.toZonedDateTime
 @return string value of the resulting zoned date-time
:)
declare function pdfbox:gregToISO($item as item())
as xs:string {
  string(Q{java:java.util.GregorianCalendar}toZonedDateTime($item))
};
 | 
			
		||||
 | 
			
		||||
(:~
 Outline (bookmarks) of $doc as a sequence of maps.
 Returns the empty sequence when the document catalog has no outline.
 @param $doc document handle
 @return one map per top-level bookmark, see pdfbox:bookmark for the keys
:)
declare function pdfbox:outline($doc as item())
as map(*)* {
  (# db:wrapjava some #) {
    let $catalog := PDDocument:getDocumentCatalog($doc)
    let $root := PDDocumentCatalog:getDocumentOutline($catalog)
    return
      if (empty($root))
      then ()
      else pdfbox:outline($doc, PDOutlineItem:getFirstChild($root))
  }
};
 | 
			
		||||
 | 
			
		||||
(:~
 Bookmark info for $outlineItem and its following siblings, as a sequence
 of maps.
 Delegates the iteration to pdfbox:_outline and returns the "list" entry of
 its result.
 @param $doc document handle
 @param $outlineItem first outline item to report, may be empty
 @return one map per bookmark
:)
declare function pdfbox:outline($doc as item(),$outlineItem as item()?)

as map(*)*{
  let $find as map(*):=pdfbox:_outline($doc ,$outlineItem)
  return map:get($find,"list")
};
 | 
			
		||||
 | 
			
		||||
(:~
 Walk the sibling chain that starts at $outlineItem and collect bookmark maps.
 The state passed through hof:until is a map with two entries:
 "this" is the outline item still to be processed and "list" holds the
 bookmark maps collected so far. Iteration stops once "this" is empty.
 A bookmark that reports children gets an extra "children" entry, filled by
 a recursive call to pdfbox:outline on its first child.
 NOTE: kept as a separate function on purpose - BaseX bug 10.7? error if
 inlined in outline.
 @param $doc document handle
 @param $outlineItem first outline item, may be empty
 @return final state map; the result of interest is its "list" entry
:)
declare function pdfbox:_outline($doc as item(),$outlineItem as item()?)
as map(*){
 hof:until(
            (: stop condition: no further sibling to process :)
            function($output) { empty($output?this) },
            (: step: describe the current item, then advance to its next sibling :)
            function($input ) { 
                      let $bk:= pdfbox:bookmark($input?this,$doc)
                      let $bk:= if($bk?hasChildren)
                                then let $kids:=pdfbox:outline($doc,PDOutlineItem:getFirstChild($input?this))
                                     return map:merge(($bk,map:entry("children",$kids)))
                                else $bk 
                      return map{
                            "list": ($input?list, $bk),
                            "this":  PDOutlineItem:getNextSibling($input?this)}
                          },
            (: initial state :)
            map{"list":(),"this":$outlineItem}
        ) 
};
 | 
			
		||||
(:~
 Outline as XML.
 @param $outline bookmark maps as returned by pdfbox:outline
 @return outline element with one bookmark child per map
:)
declare function pdfbox:outline-xml($outline as map(*)*)
as element(outline) {
  <outline>{ pdfbox:bookmark-xml($outline) }</outline>
};
 | 
			
		||||
 | 
			
		||||
(:~
 Convert bookmark maps to bookmark elements.
 Each map becomes a bookmark element with title and index attributes;
 the maps under the "children" key are converted recursively into nested
 bookmark elements.
 @param $outline bookmark maps
 @return one bookmark element per map
:)
declare function pdfbox:bookmark-xml($outline as map(*)*)
as element(bookmark)*
{
  for $entry in $outline
  return element bookmark {
    attribute title { $entry?title },
    attribute index { $entry?index },
    $entry?children ! pdfbox:bookmark-xml(.)
  }
};
 | 
			
		||||
 | 
			
		||||
(:~
 Bookmark info for a single outline item.
 Keys of the returned map:
 "index" - result of pdfbox:pageIndex for the item's destination page,
 "title" - the item's title after translate() has removed the listed characters,
 "hasChildren" - whether the item has child items.
 NOTE(review): the character list passed to translate() reads "<22>" here;
 as written it strips the characters "<", "2" and ">" from every title.
 This looks like a mangled control or special character - confirm against
 the original file.
 @param $bookmark outline item (Java object)
 @param $doc document handle
 @return bookmark map
:)
declare function pdfbox:bookmark($bookmark as item(),$doc as item())
as map(*)
{
 map{ 
  "index":  PDOutlineItem:findDestinationPage($bookmark,$doc)=>pdfbox:pageIndex($doc),
  "title":  (# db:checkstrings #) {PDOutlineItem:getTitle($bookmark)}=>translate("<22>",""),
  "hasChildren": PDOutlineItem:hasChildren($bookmark)
  }
};
 | 
			
		||||
 | 
			
		||||
(:~
 Page index of the destination page of an outline item.
 @param $page outline item whose destination page is looked up
 @param $document document handle
 @return result of pdfbox:pageIndex for the destination page
:)
declare function pdfbox:outx($page ,$document)
{
  PDOutlineItem:findDestinationPage($page, $document)
  => pdfbox:pageIndex($document)
};
 | 
			
		||||
 | 
			
		||||
(:~
 Index of $page within the page tree of $document.
 @param $page page object (java org.apache.pdfbox.pdmodel.PDPage), may be empty
 @param $document document handle
 @return value of PDPageTree.indexOf for $page, or the empty sequence when
         $page is empty
:)
declare function pdfbox:pageIndex(
   $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :),
   $document)
as item()?
{
  if (empty($page))
  then ()
  else
    let $catalog := PDDocument:getDocumentCatalog($document)
    let $pages := PDDocumentCatalog:getPages($catalog)
    return PDPageTree:indexOf($pages, $page)
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(:~
 Save a new PDF document built from a 1 based page range of $doc.
 The extracted document is saved to $target and then closed.
 @param $doc source document handle
 @param $start first page of the range
 @param $end last page of the range
 @param $target file path for the new PDF
 @return save path
:)
declare function pdfbox:extract($doc as item(), 
             $start as xs:integer,$end as xs:integer,$target as xs:string)
as xs:string
{
  let $extractor := PageExtractor:new($doc, $start, $end)
  let $part := PageExtractor:extract($extractor)
  return (
    pdfbox:save($part, $target),
    pdfbox:close($part)
  )
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(:~
 Page label information of the document.
 The return type allows the empty sequence because the catalog lookup yields
 no value for a PDF that defines no page labels; the former item() return
 type turned that case into a type error.
 @see https://www.w3.org/TR/WCAG20-TECHS/PDF17.html#PDF17-examples
 @see https://codereview.stackexchange.com/questions/286078/java-code-showing-page-labels-from-pdf-files
 @param $doc document handle
 @return page labels object (Java PDPageLabels), or empty if the document has none
:)
declare function pdfbox:getPageLabels($doc as item())
as item()?
{
  PDDocument:getDocumentCatalog($doc)
  =>PDDocumentCatalog:getPageLabels()
};
 | 
			
		||||
 | 
			
		||||
(:~
 Page label for every page, in page index order.
 @param $doc document handle
 @return one label per page, or the empty sequence when the document
         defines no page labels
:)
declare function pdfbox:pageLabels($doc as item())
as xs:string*
{
  let $labels := PDDocument:getDocumentCatalog($doc)
                 => PDDocumentCatalog:getPageLabels()
  return
    (: without page labels there is no object to call the instance method on :)
    if (exists($labels))
    then PDPageLabels:getLabelsByPageIndices($labels)
    else ()
};
 | 
			
		||||
 | 
			
		||||
(:~
 Return the text found on page $pageNo.
 A text stripper is created and both its start page and its end page are set
 to $pageNo, so exactly one page is extracted.
 NOTE(review): PDFTextStripper page numbers are presumably 1 based, unlike
 the zero based page number of pdfbox:pageBufferedImage - confirm.
 @param $doc document handle
 @param $pageNo page number passed to setStartPage and setEndPage
 @return text of the page
:)
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
as xs:string{
  (: the "instance" wrap mode presumably makes each setter call yield the
     stripper itself, which is what allows the arrow chain - TODO confirm :)
  let $tStripper := (# db:wrapjava instance #) {
         PDFTextStripper:new()
         => PDFTextStripper:setStartPage($pageNo)
         => PDFTextStripper:setEndPage($pageNo)
       }
  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
 | 
			
		||||
 | 
			
		||||
(:~
 Summary info as map for the PDF at $pdfpath.
 The map holds "file" (the path), "pages" (page count) and "outline" (number
 of top-level bookmarks), merged with the entries of pdfbox:information.
 The document is opened only for this report and is closed again before the
 function returns.
 @param $pdfpath file system path of the PDF
 @return summary map
:)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
 let $doc:=pdfbox:open($pdfpath)
 let $summary:=(map{
       "file":  $pdfpath,
       "pages": pdfbox:page-count($doc),
       "outline": pdfbox:outline($doc)=>count()
        },pdfbox:information($doc)
       )=>map:merge()
 (: pdfbox:close returns the empty sequence, so the result is still one map :)
 return ($summary, pdfbox:close($doc))
};
 | 
			
		||||
 | 
			
		||||
(:~
 Render page $pageNo of $doc as an image, scaled by $scale relative to 72 dpi.
 @param $doc document handle
 @param $pageNo (ZERO based)
 @param $scale 1=72 dpi
 @return Java java.awt.image.BufferedImage object
:)
declare function pdfbox:pageBufferedImage($doc as item(), $pageNo as xs:integer,$scale as xs:float)
as item() {
  let $renderer := PDFRenderer:new($doc)
  return PDFRenderer:renderImage($renderer, $pageNo, $scale)
};
 | 
			
		||||
 | 
			
		||||
(:~
 Save a buffered image to the file $dest.
 @param $bufferedImage Java BufferedImage, e.g. from pdfbox:pageBufferedImage
 @param $dest destination file path
 @param $type image format name such as "gif" or "png"
 @return the boolean returned by javax.imageio.ImageIO.write
:)
declare function pdfbox:imageSave($bufferedImage as item(),$dest as xs:string,$type as xs:string)
as xs:boolean {
  let $file := File:new($dest)
  return Q{java:javax.imageio.ImageIO}write($bufferedImage, $type, $file)
};
 | 
			
		||||
 | 
			
		||||
(:~
 Return the image encoded in the given format as binary data.
 The image is written into an in-memory byte stream whose bytes are then
 converted to base64Binary.
 @param $bufferedImage Java BufferedImage
 @param $type image format name such as "gif" or "png"
 @return encoded image
:)
declare function pdfbox:imageBinary($bufferedImage as item(),$type as xs:string)
as xs:base64Binary{
  let $bytes:=Q{java:java.io.ByteArrayOutputStream}new()
  (: $_ is never read; the binding exists only for the side effect of
     filling $bytes and has to stay ahead of the toByteArray call :)
  let $_:=Q{java:javax.imageio.ImageIO}write($bufferedImage , $type,  $bytes)
  return Q{java:java.io.ByteArrayOutputStream}toByteArray($bytes)
         =>convert:integers-to-base64()
};
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								lib/pdfbox-3.0.3.fat.jar
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								lib/pdfbox-3.0.3.fat.jar
									
										
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										51
									
								
								scripts/build.xqm
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								scripts/build.xqm
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,51 @@
 | 
			
		|||
(:~ build utils for REPO packaging :)
 | 
			
		||||
module namespace build = 'urn:quodatum:build1';
 | 
			
		||||
 | 
			
		||||
(:~
 Create a flat fat jar from all *.jar files found directly in $input-dir.
 The entries of every jar are collected into one zip archive. Entries whose
 path starts with "META-INF/" are taken from $manifest-jar only.
 $input-dir is concatenated with the jar names as-is, so it has to end with
 a directory separator.
 @param $input-dir directory holding the jars
 @param $manifest-jar file name of the jar that supplies the META-INF entries
 @return the combined archive
:)
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary {
  let $add-jar := function ($acc as map (*), $jar as xs:string) {
    let $archive := file:read-binary($input-dir || $jar)
    let $keep := archive:entries($archive)/string()
                 [$jar eq $manifest-jar or not(starts-with(., "META-INF/"))]
    return map {
      "name" : ($acc?name, $keep),
      "content" : ($acc?content, archive:extract-binary($archive, $keep))
    }
  }
  let $jars := file:list($input-dir, false(), "*.jar")
  let $all := fold-left($jars, map { }, $add-jar)
  return archive:create(
    $all?name,
    $all?content,
    map { "format" : "zip", "algorithm" : "deflate" }
  )
};
 | 
			
		||||
 | 
			
		||||
(:~
 Create a fat jar with lib: all entries of $manifest-jar plus every *.jar
 found in the "lib/" sub-directory of $input-dir, stored under "lib/".
 $input-dir is concatenated with file names as-is, so it has to end with a
 directory separator.
 @param $input-dir directory holding $manifest-jar and the lib folder
 @param $manifest-jar file name of the jar whose entries form the base
 @return the combined archive
:)
declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as xs:string)
as xs:base64Binary
 { 
 let $bin := file:read-binary($input-dir || $manifest-jar)
 (: entries that really exist in the manifest jar; only these are extracted,
    the lib paths below are files on disk, not archive entries :)
 let $own := archive:entries($bin)/string()
 let $lib := file:list($input-dir || "lib/", false(), "*.jar") ! concat("lib/", .)
 let $name := ($own, $lib)
 let $content := (archive:extract-binary($bin, $own),
                  $lib ! file:read-binary($input-dir || .))
 return archive:create($name, $content,
                   map { "format" : "zip", "algorithm" : "deflate" }) 
};
 | 
			
		||||
 | 
			
		||||
(:~
 Replace META-INF/MANIFEST.MF in $jar with a minimal manifest that declares
 only the manifest version and the given main class.
 @param $jar archive to update
 @param $main-class value for the Main-Class manifest attribute
 @return the updated archive
:)
declare function build:update-manifest($jar  as xs:base64Binary,$main-class as xs:string)
as xs:base64Binary {
  (: the line breaks inside the literals are part of the manifest text :)
  let $manifest := "Manifest-Version: 1.0
Main-Class: " || $main-class || "

"
  return archive:update($jar, "META-INF/MANIFEST.MF", $manifest)
};
 | 
			
		||||
 | 
			
		||||
(:~
 Add or replace a single entry of an archive.
 @param $jar archive to update
 @param $name path of the entry inside the archive
 @param $file content to store under $name
 @return the updated archive
:)
declare function build:update($jar as xs:base64Binary,$name  as xs:string,$file as xs:string)
as xs:base64Binary {
  $jar => archive:update($name, $file)
};
 | 
			
		||||
							
								
								
									
										23
									
								
								scripts/make-fat-jar.xq
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								scripts/make-fat-jar.xq
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,23 @@
 | 
			
		|||
 | 
			
		||||
import module namespace build = 'urn:quodatum:build1' at 'build.xqm';
 | 
			
		||||
 | 
			
		||||
(: Main execution.
 Builds the combined PDFBox jar:
 1. start from the manifest jar and add the jars of the lib folder,
 2. replace the manifest so that Main-Class is org.apache.pdfbox.Loader,
 3. store loader.xqm under the path derived from the main class name,
 4. write the result to the output path, resolved against the input folder.
 The input folder is located relative to this script instead of through a
 machine-specific absolute path, so the build works from any checkout.
 :)
let $input-dir := file:resolve-path("../jars/", file:base-dir())
let $config := map { 
         "manifest-jar" : "pdfbox-3.0.3.jar", 
         "input-dir" :  $input-dir, 
         "output" :  "../lib/pdfbox-3.0.3.fat.jar",
         "main-class": "org.apache.pdfbox.Loader" 
         }

let $fat-jar := build:fatjar-with-lib($config?input-dir,$config?manifest-jar)

let $fat-jar := build:update-manifest($fat-jar, $config?main-class)
(: e.g. org.apache.pdfbox.Loader becomes org/apache/pdfbox/Loader.xqm :)
let $name := replace($config?main-class,"\.","/") || ".xqm"
let $content := file:read-binary($config?input-dir || "loader.xqm")
let $fat-jar := archive:update($fat-jar, $name,$content)
let $output-file := file:resolve-path($config?output, $config?input-dir)
return (file:write-binary($output-file, $fat-jar),
        trace($output-file,"fat jar: "))
 | 
			
		||||
  
 | 
			
		||||
| 
						 | 
				
			
			@ -2,7 +2,7 @@
 | 
			
		|||
 | 
			
		||||
 :)
 | 
			
		||||
module namespace test="urn:expkg-zone58:pdfbox3:tests";
 | 
			
		||||
import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm";
 | 
			
		||||
import module namespace pdfbox="org.apache.pdfbox.Loader";
 | 
			
		||||
 | 
			
		||||
declare variable $test:base:=file:base-dir()=>file:parent()=>file:parent();
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue