1
0

[add] image save

This commit is contained in:
Andy Bunce 2024-04-11 12:45:23 +01:00
parent bd5b7cf006
commit af2fac39b2
7 changed files with 66 additions and 7 deletions

View File

@ -1,5 +1,5 @@
{
"basexTools.xquery.profile": "basex-10",
"basexTools.xquery.showHovers": false,
"basexTools.xquery.showHovers": true,
}

View File

@ -1 +1 @@
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.1.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) :)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":"# Version"},{"kind":1,"language":"markdown","value":" ## pdfbox 
version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"## save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"## PageNo text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"# Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pagenos:page-report($doc)=>pagenos:inverted-map()"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count 
\r\nreturn ``[`{$f}`: `{ $count }`]``"}]}
{"cells":[{"kind":1,"language":"markdown","value":"# PDFBox3 \r\nA BaseX 10+ interface to Apache PDFBox® library version 3 \r\n## Apache PDFBox® - A Java PDF Library\r\n\r\nThe Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.\r\nhttps://pdfbox.apache.org/"},{"kind":1,"language":"markdown","value":"It comes with the useful PDF debug tool `java -jar debugger-app-3.0.1.jar`"},{"kind":1,"language":"markdown","value":"## Set up XQuery context for following code..."},{"kind":2,"language":"xquery","value":"(:<:)(: XQuery Context :)\r\nimport module namespace pdfbox = \"urn:expkg-zone58:pdfbox3\" at \"../src/lib/pdfbox3.xqm\";\r\nimport module namespace bookpages = 'urn:bookpages' at \"../src/lib/bookpages.xqm\";\r\nimport module namespace pdfscrape = 'urn:pdfscrape' at \"../src/lib/pdfscrape.xqm\";\r\nimport module namespace config = 'urn:abc-clio:config' at 'C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO/lib/abc-config.xqm';\r\n\r\ndeclare variable $samples:= map{\r\n \"climate\": \"drop-01d\\set\\2-6-1\\A5579C_1\\271989---Book_File-Web_PDF_9798400627484_486728.pdf\",\r\n \"women\": \"drop-01d\\set\\2-6-1\\A6229C_1\\257334---Book_File-Web_PDF_9798216172628_486742.pdf\",\r\n \"genocide\": \"drop1-pdf\\GR2967-TRD\\272791---Book_File-Web_PDF_9798400640216_486366.pdf\",\r\n \"world\": \"drop-01c\\gpg-book\\2-6\\A3506C-TRD\\256186---Book_File-Web_PDF_9798216038955_486148.pdf\"\r\n};\r\ndeclare variable $PDF:= (: $samples?women=>file:resolve-path($config:data) 
:)\r\n\"C:\\Users\\mrwhe\\git\\bloomsbury\\content-architecture\\xquery\\ABC-CLIO\\data\\drop-01e\\set\\2-6-1\\A5690C_1\\257107---Book_File-Web_PDF_9798400691218_486731.pdf\";"},{"kind":1,"language":"markdown","value":" ## Check pdfbox version"},{"kind":2,"language":"xquery","value":"pdfbox:version()"},{"kind":1,"language":"markdown","value":"PDF specification version used by document"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:pdfVersion()"},{"kind":1,"language":"markdown","value":"# Page count for PDF"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:page-count()"},{"kind":1,"language":"markdown","value":"# save range to new pdf"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:extract(2,12,\"c:\\tmp\\a.pdf\")"},{"kind":1,"language":"markdown","value":"## Outline / bookmarks"},{"kind":1,"language":"markdown","value":"### sequence of maps"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:outline()"},{"kind":1,"language":"markdown","value":"XML"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)=>pdfbox:outline()=>pdfbox:outline-xml()"},{"kind":1,"language":"markdown","value":"## Page labels"},{"kind":2,"language":"xquery","value":"\r\npdfbox:open($PDF)=>pdfbox:pageLabels()"},{"kind":1,"language":"markdown","value":"# getText from page index"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfbox:getText($doc,56)"},{"kind":1,"language":"markdown","value":"# Page scraping"},{"kind":1,"language":"markdown","value":"## pdf scrape text analysis"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)\r\n"},{"kind":1,"language":"markdown","value":"## Inverted pageno map"},{"kind":2,"language":"xquery","value":"let $doc:=pdfbox:open($PDF)\r\nreturn pdfscrape:page-report($doc)=>pdfscrape:inverted-map()"},{"kind":1,"language":"markdown","value":"# Save 
images"},{"kind":2,"language":"xquery","value":"pdfbox:open($PDF)\r\n=> pdfbox:pageAsImage(0,0.25)\r\n=> pdfbox:imageSave(\"c:\\tmp\\page0.gif\",\"gif\")"},{"kind":1,"language":"markdown","value":"## report"},{"kind":2,"language":"xquery","value":"declare variable $a:=file:resolve-path(\"../data/1e/\",file:base-dir());\r\n\r\nfor $f in file:list($a,true(),\"*.pdf\") \r\nwhere not(contains($f,\"outputs\"))\r\nlet $doc:=pdfbox:open(file:resolve-path($f,$a))\r\n(: let $outline:=pdfbox:outline($doc) :)\r\nlet $count:=pdfbox:page-count($doc)\r\norder by $count \r\nreturn ``[`{$f}`: `{ $count }`]``"}]}

View File

@ -1 +1,5 @@
Examples with pageLabels and outlines
# Example PDFs with pageLabels and outlines
* [BaseX100.pdf](https://files.basex.org/releases/10.0/BaseX100.pdf)
* [icelandic-dictionary.pdf](http://css4.pub/2015/icelandic/dictionary.pdf)
* [page-numbers.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers.pdf)

View File

@ -40,7 +40,7 @@ declare namespace PDDocumentInformation ="java:org.apache.pdfbox.pdmodel.PDDocum
@see https://javadoc.io/static/org.apache.pdfbox/pdfbox/3.0.0/org/apache/pdfbox/pdmodel/interactive/documentnavigation/outline/PDOutlineItem.html
:)
declare namespace PDOutlineItem="java:org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem";
declare namespace PDFRenderer="java:org.apache.pdfbox.rendering.PDFRenderer";
declare namespace RandomAccessReadBufferedFile = "java:org.apache.pdfbox.io.RandomAccessReadBufferedFile";
declare namespace File ="java:java.io.File";
@ -220,6 +220,7 @@ as xs:string*
=>PDDocumentCatalog:getPageLabels()
=>PDPageLabels:getLabelsByPageIndices()
};
(:~ return text on $pageNo :)
declare function pdfbox:getText($doc as item(), $pageNo as xs:integer)
as xs:string{
@ -231,6 +232,7 @@ as xs:string{
return (# db:checkstrings #) {PDFTextStripper:getText($tStripper,$doc)}
};
(:~ summary info as map for $pdfpath :)
declare function pdfbox:report($pdfpath as xs:string)
as map(*){
let $doc:=pdfbox:open($pdfpath)
@ -242,8 +244,17 @@ as map(*){
)=>map:merge()
};
(: @TODO :)
declare function pdfbox:pageAsImage($doc as item(), $pageNo as xs:integer)
(:~ Render a single page of an open PDF document as an image.
 @param $doc    PDF document object to render from
 @param $pageNo page index, ZERO based
 @param $scale  scaling factor, 1 = 72 dpi
 @return the rendered page image (a Java object, passed on as item())
:)
declare function pdfbox:pageAsImage($doc as item(), $pageNo as xs:integer,$scale as xs:float)
as item(){
  (: Java equivalent for a fixed DPI instead of a scale:
     BufferedImage image = pdfRenderer.renderImageWithDPI(i, 200, ImageType.RGB) :)
  let $renderer := PDFRenderer:new($doc)
  return PDFRenderer:renderImage($renderer, $pageNo, $scale)
};
(:~ Write a buffered image to a file.
 @param $bufferedImage image object to write, e.g. the result of pdfbox:pageAsImage
 @param $dest          path of the file to write
 @param $type          image format name, e.g. "gif", "png"
 @return the boolean returned by javax.imageio.ImageIO.write
:)
declare function pdfbox:imageSave($bufferedImage as item(),$dest as xs:string,$type as xs:string)
as xs:boolean{
  let $target := File:new($dest)
  return Q{java:javax.imageio.ImageIO}write($bufferedImage, $type, $target)
};

8
src/scratch/abc2.xq Normal file
View File

@ -0,0 +1,8 @@
(: Scratch query: declares sample PDF locations; the query body is a placeholder. :)
(: $samples maps a short sample name to a Windows-style relative path of a PDF. :)
declare variable $samples:= map{
"climate": "data\drop-01d\set\2-6-1\A5579C_1\271989---Book_File-Web_PDF_9798400627484_486728.pdf",
"women": "data\drop-01d\set\2-6-1\A6229C_1\257334---Book_File-Web_PDF_9798216172628_486742.pdf",
"genocide": "data\drop1-pdf\GR2967-TRD\272791---Book_File-Web_PDF_9798400640216_486366.pdf",
"world": "data\drop-01c\gpg-book\2-6\A3506C-TRD\256186---Book_File-Web_PDF_9798216038955_486148.pdf"
};
(: Absolute, machine-specific data directory.
   NOTE(review): the sample paths above start with "data\" while $base already ends in "\data";
   confirm which prefix is intended before resolving one against the other. :)
declare variable $base:= "C:\Users\mrwhe\git\bloomsbury\content-architecture\xquery\ABC-CLIO\data";
(: Placeholder result: neither variable is used yet. :)
42

View File

@ -6,6 +6,10 @@ html {
padding: 0;
font-family: 'Open Sans', sans-serif;
}
/* sl-card elements: laid out as a flex container with a fixed 100px width. */
sl-card {
display: flex;
width: 100px;
}
.App {
width: 800px;

View File

@ -0,0 +1,32 @@
<!doctype html>
<script type="module">
import { define, store, html } from 'https://esm.sh/hybrids@^8';
// Store model definition for a "User" record, loaded from the local PDF API.
const User = {
  id: true,
  res: null, // NOTE(review): confirm the store accepts null as a property default
  firstName: "",
  lastName: "",
  [store.connect]: {
    // `id` is received but not used: every lookup requests the same endpoint.
    get: id => fetch(`http://localhost:8080/pdf/api/sources`)
      .then(response => {
        // Reject on HTTP errors instead of trying to parse an error page as JSON.
        if (!response.ok) {
          throw new Error(`Request failed with status ${response.status}`);
        }
        // Return the parsed body. The previous second handler used a block body
        // (`{ "43" }`), which evaluated a string and returned nothing, so the
        // promise always resolved to undefined and the fetched data was lost.
        return response.json();
      }),
  },
};
// Defines the <user-details> custom element.
// Its `user` property is declared with store(User); the render function shows
// "Loading..." while store.pending(user) is truthy, "Something went wrong..."
// when store.error(user) is truthy, and the user's id and lastName once
// store.ready(user) is truthy.
// NOTE(review): the `user="2"` attribute in the markup presumably supplies the
// model id — confirm against the hybrids store documentation.
define({
tag: "user-details",
user: store(User),
render: ({ user }) => html`
<div>
${store.pending(user) && `Loading...`}
${store.error(user) && `Something went wrong...`}
${store.ready(user) && html`
<p>${user.id} ${user.lastName}</p>
`}
</div>
`,
});
</script>
<user-details user="2"></user-details>