[mod] lost
This commit is contained in:
		
							parent
							
								
									38d12d91c1
								
							
						
					
					
						commit
						d37f923d09
					
				
					 8 changed files with 106 additions and 41 deletions
				
			
		| 
						 | 
				
			
			@ -1,9 +1,11 @@
 | 
			
		|||
# Pdfbox
 | 
			
		||||
A BaseX interface for [Pdfbox](https://pdfbox.apache.org/) version 3. 
 | 
			
		||||
It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7
 | 
			
		||||
It is packaged using the [Expath](https://docs.basex.org/main/Repository#expath_packaging) format, and is tested against BaseX 10.7 and 11.7.
 | 
			
		||||
 | 
			
		||||
* The Pdfbox 3 [FAQ](https://pdfbox.apache.org/3.0/faq.html) may be useful.
 | 
			
		||||
## Features
 | 
			
		||||
* read PDF page count.
 | 
			
		||||
* read any PDF outline and return as maps or XML.
 | 
			
		||||
* read any PDF outline and return as map(s) or XML.
 | 
			
		||||
* read pagelabels.
 | 
			
		||||
* read page text.
 | 
			
		||||
* save pdf page range to a new pdf.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										15
									
								
								package.json
									
										
									
									
									
								
							
							
						
						
									
										15
									
								
								package.json
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -1,13 +1,15 @@
 | 
			
		|||
{
 | 
			
		||||
  "name": "pdfbox",
 | 
			
		||||
  "version": "1.0.0",
 | 
			
		||||
  "version": "0.1.1",
 | 
			
		||||
  "description": "A BaseX interface to Apache Pdfbox version 3",
 | 
			
		||||
  "main": "index.js",
 | 
			
		||||
  "main": "Pdfbox.xqm",
 | 
			
		||||
  "homepage": "https://github.com/npm/example#readme",
 | 
			
		||||
  "directories": {
 | 
			
		||||
    "doc": "docs"
 | 
			
		||||
  },
 | 
			
		||||
  "scripts": {
 | 
			
		||||
    "test": "%BASEX10%/bin/basex -t src/test"
 | 
			
		||||
    "test": "%BASEX10%/bin/basex -t src/test",
 | 
			
		||||
    "docs": "xqdoca"
 | 
			
		||||
  },
 | 
			
		||||
  "keywords": [
 | 
			
		||||
    "pdf",
 | 
			
		||||
| 
						 | 
				
			
			@ -16,5 +18,8 @@
 | 
			
		|||
    "java"
 | 
			
		||||
  ],
 | 
			
		||||
  "author": "Andy Bunce",
 | 
			
		||||
  "license": "Apache-2.0"
 | 
			
		||||
}
 | 
			
		||||
  "license": "Apache-2.0",
 | 
			
		||||
  "quodatum": {
 | 
			
		||||
    "random": true
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										17
									
								
								package.xml
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								package.xml
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,17 @@
 | 
			
		|||
<package  name="org.expkg_zone58.Pdfbox3"
 | 
			
		||||
         abbrev="pdfbox"
 | 
			
		||||
         version="0.1.1"
 | 
			
		||||
         spec="1.0">
 | 
			
		||||
    <component name="pdfbox-3.0.4.jar">
 | 
			
		||||
        <source type="maven">org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar</source>
 | 
			
		||||
    </component>
 | 
			
		||||
    <component name="pdfbox-io-3.0.4.jar">
 | 
			
		||||
        <source type="maven">org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar</source>
 | 
			
		||||
    </component>
 | 
			
		||||
    <component name="fontbox-3.0.4.jar">
 | 
			
		||||
        <source type="maven">org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar</source>
 | 
			
		||||
    </component>
 | 
			
		||||
    <component name="commons-logging-1.3.4.jar">
 | 
			
		||||
        <source type="maven">commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar</source>
 | 
			
		||||
    </component>
 | 
			
		||||
</package>
 | 
			
		||||
| 
						 | 
				
			
			@ -4,6 +4,8 @@ module namespace build = 'urn:quodatum:build1';
 | 
			
		|||
(:~ create a flat fat jar from jars in $input-dir
 | 
			
		||||
keeping only META-INF from $manifest-jar 
 | 
			
		||||
:)
 | 
			
		||||
declare variable $build:archive-opts:= map { "format" : "zip", "algorithm" : "deflate" };
 | 
			
		||||
 | 
			
		||||
declare function build:fatjar-from-folder($input-dir as xs:string,$manifest-jar as xs:string)
 | 
			
		||||
as xs:base64Binary { 
 | 
			
		||||
    let $fold :=
 | 
			
		||||
| 
						 | 
				
			
			@ -17,9 +19,7 @@ function ($res as map (*), $jar as xs:string) {
 | 
			
		|||
}
 | 
			
		||||
let $res := file:list($input-dir, false(), "*.jar")
 | 
			
		||||
            =>fold-left( map { }, $fold)
 | 
			
		||||
return
 | 
			
		||||
    archive:create($res? name, $res? content,
 | 
			
		||||
                   map { "format" : "zip", "algorithm" : "deflate" }) 
 | 
			
		||||
return archive:create($res? name, $res? content,$build:archive-opts) 
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ create a fat jar with lib 
 | 
			
		||||
| 
						 | 
				
			
			@ -34,8 +34,7 @@ declare function build:fatjar-with-lib($input-dir as xs:string,$manifest-jar as
 | 
			
		|||
              ,$lib)
 | 
			
		||||
 let  $content:=(archive:extract-binary($bin,$name)
 | 
			
		||||
                ,$lib!file:read-binary($input-dir || .))
 | 
			
		||||
return  archive:create($name, $content,
 | 
			
		||||
                   map { "format" : "zip", "algorithm" : "deflate" }) 
 | 
			
		||||
return  archive:create($name, $content,$build:archive-opts)
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ update-manifest :)
 | 
			
		||||
| 
						 | 
				
			
			@ -61,15 +60,13 @@ as xs:base64Binary{
 | 
			
		|||
            build:xar-add(map{},file:resolve-path("jars/",$base),"content/")
 | 
			
		||||
            =>build:xar-add(file:resolve-path("src/Pdfbox3.xqm",$base),"content/")
 | 
			
		||||
            =>build:xar-add(file:resolve-path("src/metadata/",$base),"")
 | 
			
		||||
  return  archive:create($entries?name, $entries?content,
 | 
			
		||||
                   map { "format" : "zip", "algorithm" : "deflate" })         
 | 
			
		||||
  return  archive:create($entries?name, $entries?content,$build:archive-opts)      
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ zip data for $dir
 | 
			
		||||
:)
 | 
			
		||||
declare function build:xar-add($map as map(*),$src as xs:string,$xar-dir as xs:string)
 | 
			
		||||
as map(*){
 | 
			
		||||
let $_:=trace(count($map?name),"size ")
 | 
			
		||||
let $names:=if(file:is-dir($src))
 | 
			
		||||
            then file:list($src)[not(starts-with(.,'.'))]!concat($src,.)
 | 
			
		||||
            else $src
 | 
			
		||||
| 
						 | 
				
			
			@ -95,7 +92,7 @@ as empty-sequence(){
 | 
			
		|||
};
 | 
			
		||||
 | 
			
		||||
(:~ write-binary, creating dir if required :)
 | 
			
		||||
declare function build:write-binary($dest as xs:string,$contents)
 | 
			
		||||
declare function build:write-binary($dest as xs:string,$contents as xs:base64Binary?)
 | 
			
		||||
as empty-sequence(){
 | 
			
		||||
file:create-dir(file:parent($dest)),
 | 
			
		||||
file:write-binary($dest,$contents)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -13,4 +13,4 @@ let $_:=build:maven-download($maven-urls,$base || "jars/")
 | 
			
		|||
let $xar:=build:xar-create($base)
 | 
			
		||||
let $output-file := file:resolve-path("dist/pdfbox.xar",$base)
 | 
			
		||||
return (build:write-binary($output-file, $xar),
 | 
			
		||||
        trace($output-file,"zar: "))
 | 
			
		||||
        trace($output-file,"xar: "))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										22
									
								
								scripts/maven.xqm
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								scripts/maven.xqm
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,22 @@
 | 
			
		|||
(:~  maven access
 | 
			
		||||
 :
 | 
			
		||||
 ::)
 | 
			
		||||
module namespace mvn = 'urn:quodatum:maven:1';
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
declare variable $mvn:example := <dependency>
 | 
			
		||||
  <groupId>org.ccil.cowan.tagsoup</groupId>
 | 
			
		||||
  <artifactId>tagsoup</artifactId>
 | 
			
		||||
  <version>1.2.1</version>
 | 
			
		||||
</dependency>;
 | 
			
		||||
 | 
			
		||||
(:~ Build the Maven Central download URL for a dependency.
 : Repository layout is {groupId with "." replaced by "/"}/{artifactId}/{version}/{artifactId}-{version}.{ext}
 : @param $dep dependency element with groupId, artifactId and version children
 : @param $ext file extension without the leading dot, e.g. "jar" or "pom"
 : @return absolute URL below https://repo.maven.apache.org/maven2/
 :)
declare function mvn:url($dep as element(dependency),$ext as xs:string)
as xs:string { 
    (: select children by name so the result does not depend on element order
       or on extra children such as scope :)
    let $group    := $dep/groupId/string()
    let $artifact := $dep/artifactId/string()
    let $version  := $dep/version/string()
    return string-join(
        ("https://repo.maven.apache.org/maven2/",
          (: the dotted groupId maps to nested directories in the repository :)
          translate($group, ".", "/"), "/", $artifact, "/", $version,
          "/", $artifact, "-", $version, ".", $ext
          ))
 };
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -4,8 +4,8 @@ pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library,
 | 
			
		|||
requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar
 | 
			
		||||
@see download https://pdfbox.apache.org/download.cgi
 | 
			
		||||
@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/
 | 
			
		||||
 | 
			
		||||
:)
 | 
			
		||||
 | 
			
		||||
module namespace pdfbox="org.expkg_zone58.Pdfbox3";
 | 
			
		||||
 | 
			
		||||
declare namespace Loader ="java:org.apache.pdfbox.Loader"; 
 | 
			
		||||
| 
						 | 
				
			
			@ -28,21 +28,30 @@ declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
 | 
			
		|||
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
 | 
			
		||||
declare namespace File ="java:java.io.File";
 | 
			
		||||
 | 
			
		||||
declare variable $pdfbox:package-version:="0.1.1";
 | 
			
		||||
 | 
			
		||||
(:~ SemVer version of this package 
 | 
			
		||||
with build metadata for Apacke Pdfbox in use  e.g. "0.1.0+pdfbox3.0.4"
 | 
			
		||||
with build metadata for Apache Pdfbox in use  e.g. "0.1.0+pdfbox3.0.4"
 | 
			
		||||
:)
 | 
			
		||||
declare function pdfbox:version()
 | 
			
		||||
as xs:string{
 | 
			
		||||
  "0.1.0+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
 | 
			
		||||
  $pdfbox:package-version ||"+pdfbox" || Q{java:org.apache.pdfbox.util.Version}getVersion()
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(: open pdf,apply function, close pdf
 | 
			
		||||
with-document pattern, creates local pdfobject and ensures it is closed
 | 
			
		||||
e.g "path..." => pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
 | 
			
		||||
(:~ with-document pattern: open pdf,apply function, close pdf
 | 
			
		||||
 creates a local pdfobject and ensures it is closed after use
 | 
			
		||||
e.g. pdfbox:with-pdf("path...",pdfbox:page-text(?,5))
 | 
			
		||||
:)
 | 
			
		||||
declare function pdfbox:with-pdf($src as xs:string,$fn as function(*)*)
 | 
			
		||||
declare function pdfbox:with-pdf($src as xs:string,
 | 
			
		||||
                                $fn as function(item())as item()*)
 | 
			
		||||
as item()*{
 | 
			
		||||
 "@TODO"
 | 
			
		||||
 let $pdf:=pdfbox:open($src)
 return try{
        (: result of the callback, followed by whatever pdfbox:close returns :)
        $fn($pdf),pdfbox:close($pdf)
        } catch *{
          (: close the document, then re-raise the ORIGINAL error;
             a bare error() would replace it with a generic FOER0000
             and hide the real code and message from the caller :)
          pdfbox:close($pdf),
          error($err:code,string($err:description),$err:value)
        }
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ open pdf, returns pdf object :)
 | 
			
		||||
| 
						 | 
				
			
			@ -63,7 +72,7 @@ as xs:string{
 | 
			
		|||
 PDDocument:getVersion($pdf)=>xs:decimal()=>round(4)=>string()
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ save pdf $pdf to $savepath , returns $savepath :)
 | 
			
		||||
(:~ save pdf $pdf to filesystem at $savepath , returns $savepath :)
 | 
			
		||||
declare function pdfbox:save($pdf as item(),$savepath as xs:string)
 | 
			
		||||
as xs:string{
 | 
			
		||||
   PDDocument:save($pdf, File:new($savepath)),$savepath
 | 
			
		||||
| 
						 | 
				
			
			@ -97,7 +106,7 @@ as xs:base64Binary{
 | 
			
		|||
};
 | 
			
		||||
 | 
			
		||||
(:~ map with document metadata :)
 | 
			
		||||
declare function pdfbox:information($pdf as item())
 | 
			
		||||
declare function pdfbox:metadata($pdf as item())
 | 
			
		||||
as map(*){
 | 
			
		||||
  let $info:=PDDocument:getDocumentInformation($pdf)
 | 
			
		||||
  return map{
 | 
			
		||||
| 
						 | 
				
			
			@ -105,13 +114,36 @@ as map(*){
 | 
			
		|||
    "creator": PDDocumentInformation:getCreator($info),
 | 
			
		||||
    "producer": PDDocumentInformation:getProducer($info),
 | 
			
		||||
    "subject": PDDocumentInformation:getSubject($info),
 | 
			
		||||
     "keywords": PDDocumentInformation:getKeywords($info),
 | 
			
		||||
     "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
 | 
			
		||||
    "keywords": PDDocumentInformation:getKeywords($info),
 | 
			
		||||
    "creationdate": pdfbox:gregToISO(PDDocumentInformation:getCreationDate($info)),
 | 
			
		||||
    "author": PDDocumentInformation:getAuthor($info)
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ summary info as map for $pdfpath :)
 | 
			
		||||
(:~ summary info as map for $pdfpath.
 : Opens the pdf, merges file/pages/hasOutline/specification with the
 : entries from pdfbox:metadata, and closes the pdf again.
 : NOTE(review): assumes pdfbox:close returns the empty sequence - confirm.
 : @param $pdfpath path passed to pdfbox:open
 : @return single merged map
 :)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
 let $pdf:=pdfbox:open($pdfpath)
 return try{
        let $report:=(map{
               "file":  $pdfpath,
               "pages": pdfbox:page-count($pdf),
               "hasOutline": pdfbox:hasOutline($pdf),
               "specification":pdfbox:specification($pdf)
               },pdfbox:metadata($pdf)
              )=>map:merge()
        (: the document was previously left open; close it once the map is built :)
        return ($report,pdfbox:close($pdf))
        } catch *{
          (: close on failure too, then re-raise the original error :)
          pdfbox:close($pdf),
          error($err:code,string($err:description),$err:value)
        }
};
 | 
			
		||||
 | 
			
		||||
 (:~ true if $pdf has a document outline :)
 | 
			
		||||
declare function pdfbox:hasOutline($pdf as item())
as xs:boolean{
  (# db:wrapjava some #) {
    (: fetch the outline from the document catalog and test whether anything came back :)
    exists(
      PDDocument:getDocumentCatalog($pdf)
      =>PDDocumentCatalog:getDocumentOutline()
    )
  }
};
 | 
			
		||||
 | 
			
		||||
(:~ outline for $pdf as map()* :)
 | 
			
		||||
declare function pdfbox:outline($pdf as item())
 | 
			
		||||
| 
						 | 
				
			
			@ -236,17 +268,7 @@ as xs:string{
 | 
			
		|||
  return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
(:~ summary info as map for $pdfpath :)
 | 
			
		||||
declare function pdfbox:report($pdfpath as xs:string)
 | 
			
		||||
as map(*){
 | 
			
		||||
 let $doc:=pdfbox:open($pdfpath)
 | 
			
		||||
 return (map{
 | 
			
		||||
       "file":  $pdfpath,
 | 
			
		||||
       "pages": pdfbox:page-count($doc),
 | 
			
		||||
       "outline": pdfbox:outline($doc)=>count()
 | 
			
		||||
        },pdfbox:information($doc)
 | 
			
		||||
)=>map:merge()
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(:~ convert date :)
 | 
			
		||||
declare %private
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
<package xmlns="http://expath.org/ns/pkg"
 | 
			
		||||
         name="org.expkg_zone58.Pdfbox3"
 | 
			
		||||
         abbrev="pdfbox"
 | 
			
		||||
         version="0.1.0"
 | 
			
		||||
         version="0.1.1"
 | 
			
		||||
         spec="1.0">
 | 
			
		||||
 | 
			
		||||
   <title>BaseX  interface to Pdfbox (https://pdfbox.apache.org/) version 3</title>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue