diff --git a/.gitignore b/.gitignore index 35f13f1..0a1d31c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data/ +dist/ docs/xqdoc/ \ No newline at end of file diff --git a/jars/commons-logging-1.3.4.jar b/jars/commons-logging-1.3.4.jar deleted file mode 100644 index b6339bb..0000000 Binary files a/jars/commons-logging-1.3.4.jar and /dev/null differ diff --git a/jars/fontbox-3.0.4.jar b/jars/fontbox-3.0.4.jar deleted file mode 100644 index d814425..0000000 Binary files a/jars/fontbox-3.0.4.jar and /dev/null differ diff --git a/jars/pdfbox-3.0.4.jar b/jars/pdfbox-3.0.4.jar deleted file mode 100644 index bf0a91e..0000000 Binary files a/jars/pdfbox-3.0.4.jar and /dev/null differ diff --git a/jars/pdfbox-io-3.0.4.jar b/jars/pdfbox-io-3.0.4.jar deleted file mode 100644 index 83a1093..0000000 Binary files a/jars/pdfbox-io-3.0.4.jar and /dev/null differ diff --git a/lib/pdfbox-3.0.4.fat.jar b/lib/pdfbox-3.0.4.fat.jar deleted file mode 100644 index 66f8833..0000000 Binary files a/lib/pdfbox-3.0.4.fat.jar and /dev/null differ diff --git a/scripts/download.xq b/scripts/download.xq new file mode 100644 index 0000000..3004e5f --- /dev/null +++ b/scripts/download.xq @@ -0,0 +1,17 @@ + +import module namespace build = 'urn:quodatum:build1' at 'build.xqm'; + +declare variable $files := ( +"https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox/3.0.4/pdfbox-3.0.4.jar", +"https://repo1.maven.org/maven2/org/apache/pdfbox/pdfbox-io/3.0.4/pdfbox-io-3.0.4.jar", +"https://repo1.maven.org/maven2/org/apache/pdfbox/fontbox/3.0.4/fontbox-3.0.4.jar", +"https://repo1.maven.org/maven2/commons-logging/commons-logging/1.3.4/commons-logging-1.3.4.jar" +); + + + +let $base:= file:resolve-path("../",static-base-uri()) +let $target:=file:resolve-path("jars/",$base) +for $f in $files +let $n:=replace($f,"^.*/","") =>trace("N") +return file:write-binary($target || $n, fetch:binary($f)) \ No newline at end of file diff --git a/scripts/make-fat-jar.xq b/scripts/make-fat-jar.xq index 7c52f08..51b8b16 100644 --- a/scripts/make-fat-jar.xq +++ b/scripts/make-fat-jar.xq @@ -8,7 +8,7 @@ let $config :=map { "base": file:resolve-path("../",static-base-uri()), "manifest-jar" : "pdfbox-3.0.4.jar", "input-dir" : "jars/", - "output" : "lib/pdfbox-3.0.4.fat.jar", + "output" : "dist/pdfbox-3.0.4.fat.jar", "main-class": "org.expkg_zone58.Pdfbox3" } let $jar-path:=file:resolve-path($config?input-dir,$config?base=>trace("base "))=>trace("jar: ") @@ -16,7 +16,7 @@ let $fat-jar := build:fatjar-from-folder($jar-path,$config?manifest-jar) let $fat-jar:=build:update-manifest($fat-jar, $config?main-class) let $name:=replace($config?main-class,"\.","/") || ".xqm" -let $content:=file:read-binary($jar-path || "Pdfbox3.xqm") +let $content:=file:read-binary($config?base || "src/Pdfbox3.xqm") let $fat-jar:=archive:update($fat-jar, $name,$content) let $output-file := file:resolve-path($config?output,$config?base) return (file:write-binary($output-file, $fat-jar), diff --git a/jars/Pdfbox3.xqm b/src/Pdfbox3.xqm similarity index 90% rename from jars/Pdfbox3.xqm rename to src/Pdfbox3.xqm index b52353c..a14a01a 100644 --- a/jars/Pdfbox3.xqm +++ b/src/Pdfbox3.xqm @@ -1,9 +1,9 @@ xquery version '3.1'; (:~ pdfbox 3.0 https://pdfbox.apache.org/ BaseX 10.7+ interface library, -requires pdfbox jar on classpath, tested with pdfbox-app-3.0.3.jar +requires pdfbox jar on classpath, tested with pdfbox-app-3.0.4.jar @see download https://pdfbox.apache.org/download.cgi -@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.3/ +@javadoc https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.4/ :) module namespace pdfbox="org.expkg_zone58.Pdfbox3"; @@ -11,32 +11,26 @@ module namespace pdfbox="org.expkg_zone58.Pdfbox3"; declare namespace Loader ="java:org.apache.pdfbox.Loader"; declare namespace PDFTextStripper = "java:org.apache.pdfbox.text.PDFTextStripper"; -(:~ -@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDDocument.html -:) +(:~ @javadoc org/apache/pdfbox/pdmodel/PDDocument.html :) declare namespace PDDocument ="java:org.apache.pdfbox.pdmodel.PDDocument"; declare namespace PDDocumentCatalog ="java:org.apache.pdfbox.pdmodel.PDDocumentCatalog"; declare namespace PDPageLabels ="java:org.apache.pdfbox.pdmodel.common.PDPageLabels"; -(:~ -@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/multipdf/PageExtractor.html -:) +(:~ @javadoc org/apache/pdfbox/multipdf/PageExtractor.html :) declare namespace PageExtractor ="java:org.apache.pdfbox.multipdf.PageExtractor"; -(:~ - @see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/PDPageTree.html -:) +(:~ @javadoc org/apache/pdfbox/pdmodel/PDPageTree.html :) declare namespace PDPageTree ="java:org.apache.pdfbox.pdmodel.PDPageTree"; (:~ -@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.2/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html +@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDDocumentOutline.html :) declare namespace PDDocumentOutline ="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline"; declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocumentInformation"; (:~ -@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html +@javadoc org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html :) declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem"; declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer"; @@ -55,7 +49,7 @@ as item(){ Loader:loadPDF( RandomAccessReadBufferedFile:new($pdfpath)) }; -(:~ the PDF specification version this $pdf conforms to.:) +(:~ the version of the PDF specification used by $pdf :) declare function pdfbox:pdfVersion($pdf as item()) as xs:float{ PDDocument:getVersion($pdf) @@ -179,14 +173,14 @@ declare function pdfbox:outx($page ,$document) return $pageNumber }; -(:~ pageIndex of $page in $document :) +(:~ pageIndex of $page in $pdf :) declare function pdfbox:pageIndex( $page as item()? (: as java:org.apache.pdfbox.pdmodel.PDPage :), - $document) + $pdf) as item()? { if(exists($page)) - then PDDocument:getDocumentCatalog($document) + then PDDocument:getDocumentCatalog($pdf) =>PDDocumentCatalog:getPages() =>PDPageTree:indexOf($page) }; diff --git a/src/lib/abc.xqm b/src/lib/abc.xqm deleted file mode 100644 index a7fa812..0000000 --- a/src/lib/abc.xqm +++ /dev/null @@ -1,5 +0,0 @@ -xquery version '3.1'; -(:~ look for pagenos in pdf text -pagenos:page-report($doc )=>pagenos:inverted-map() -:) -module namespace pagenos = 'urn:pageno'; \ No newline at end of file diff --git a/src/lib/pdfscrape.xqm b/src/lib/pdfscrape.xqm index 9304570..5088ec2 100644 --- a/src/lib/pdfscrape.xqm +++ b/src/lib/pdfscrape.xqm @@ -3,7 +3,7 @@ xquery version '3.1'; pdfscrape:page-report($doc )=>pdfscrape:inverted-map() :) module namespace pdfscrape = 'urn:pdfscrape'; -import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "pdfbox3.xqm"; +import module namespace pdfbox="org.expkg_zone58.Pdfbox3" ; (:~ page number regex @todo last line and roman diff --git a/src/scratch/abc.xq b/src/scratch/abc.xq deleted file mode 100644 index ec55204..0000000 --- a/src/scratch/abc.xq +++ /dev/null @@ -1,26 +0,0 @@ -(: test use of pageIndex :) -import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm"; - -declare variable $base:=file:base-dir(); -declare function local:go($doc,$pdf as element(pdf)){ - let $range:=$pdf/@pages/tokenize(.,"–") - let $start:=$range[1] - let $end:=if(count($range) eq 1) then $range[1] else $range[2] - - return ``[ `{$start}` ;;; `{ $end }` ]`` -}; -let $src:="257107---Book_File-Web_PDF_9798400691218_486731.pdf"=>file:resolve-path($base) -let $doc:=pdfbox:open($src) -let $labels:= pdfbox:pageLabels($doc) -let $pdfs:=doc("pdfs\chunks-docbook.xml")/chunks/pdf -for $pdf in $pdfs -let $range:=$pdf/@pages/tokenize(.,"–") -let $start:=$range[1] -let $end:=if(count($range) eq 1) then $range[1] else $range[2] -let $startIndex:=index-of($labels,$start) -let $endIndex:=index-of($labels,$end) -return if(exists($startIndex) and exists($endIndex)) - then $pdf/@pages || " " || $startIndex || ":" || $endIndex - (: pdfbox:extract($doc,$startIndex,$endIndex,file:resolve-path($pdf/@fileref,$base)) :) - else $pdf/@pages - diff --git a/src/scratch/abc2.xq b/src/scratch/abc2.xq deleted file mode 100644 index 0bf0494..0000000 --- a/src/scratch/abc2.xq +++ /dev/null @@ -1,8 +0,0 @@ -declare variable $samples:= map{ - "climate": "data\drop-01d\set\2-6-1\A5579C_1\271989---Book_File-Web_PDF_9798400627484_486728.pdf", - "women": "data\drop-01d\set\2-6-1\A6229C_1\257334---Book_File-Web_PDF_9798216172628_486742.pdf", - "genocide": "data\drop1-pdf\GR2967-TRD\272791---Book_File-Web_PDF_9798400640216_486366.pdf", - "world": "data\drop-01c\gpg-book\2-6\A3506C-TRD\256186---Book_File-Web_PDF_9798216038955_486148.pdf" -}; -declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data"; -42 \ No newline at end of file diff --git a/src/scratch/nos.xq b/src/scratch/nos.xq deleted file mode 100644 index 52f83c2..0000000 --- a/src/scratch/nos.xq +++ /dev/null @@ -1,20 +0,0 @@ -(:~ describe book page numbering :) - -import module namespace pdfbox="urn:expkg-zone58:pdfbox3" ; -import module namespace bookpages="urn:bookpages" at "../lib/bookpages.xqm"; -import module namespace pdfscrape="urn:pdfscrape" at "../lib/pdfscrape.xqm"; - -declare variable $base:="C:\Users\mrwhe\Desktop\1e\"; -declare variable $tests:=map{ - "simple":"20#C,R,7D", - "set\2-6-2\A5267C": "1037#C,r,28D,520r:V2,526D@493", - "gpg-book\2-3\A3581C-TRD": "848#C,r:Vol1:,28D,400r:Vol2:,438D@401" -}; -let $pdf:=pdfbox:open("C:\Users\mrwhe\Desktop\1e\set\2-6-2\A5267C\257273---Book_File-Web_PDF_9798400612572_486638.pdf") -let $l:=pdfbox:getPageLabels($pdf) - -let $index:=bookpages:expand($tests?"set\2-6-2\A5267C") -return pdfscrape:score($l,pdfscrape:page-report($pdf)) - - - diff --git a/src/scratch/pdfbox.xq b/src/scratch/pdfbox.xq deleted file mode 100644 index 1834e2e..0000000 --- a/src/scratch/pdfbox.xq +++ /dev/null @@ -1,23 +0,0 @@ -(: PDFBOX experiments -:) - -import module namespace pdfbox="urn:expkg-zone58:pdfbox3" at "../lib/pdfbox3.xqm"; - - -declare variable $samples:= map{ - "climate": "data\drop-01d\set\2-6-1\A5579C_1\271989---Book_File-Web_PDF_9798400627484_486728.pdf", - "women": "data\drop-01d\set\2-6-1\A6229C_1\257334---Book_File-Web_PDF_9798216172628_486742.pdf", - "genocide": "data\drop1-pdf\GR2967-TRD\272791---Book_File-Web_PDF_9798400640216_486366.pdf", - "world": "data\drop-01c\gpg-book\2-6\A3506C-TRD\256186---Book_File-Web_PDF_9798216038955_486148.pdf" -}; -declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data"; -(:~ resolve :) -declare variable $PDF:= -$samples?world=>file:resolve-path($base) -(: "C:\Users\mrwhe\git\expkg-zone58\pdfbox\samples.pdf\icelandic-dictionary.pdf" :) -; - - - - - pdfbox:report($PDF) \ No newline at end of file