diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..b87e2eb --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,30 @@ +name: Run BaseX Tests + +on: + pull_request: + branches: + - main + workflow_dispatch: # Enables manual trigger + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Java + uses: actions/setup-java@v2 + with: + java-version: '11' + distribution: 'temurin' + + - name: Install BaseX + run: | + wget https://files.basex.org/releases/10.7/BaseX107.zip + unzip BaseX107.zip -d basex + + - name: Run BaseX Tests + run: | + ./basex/bin/basex -t . \ No newline at end of file diff --git a/lib/pdfbox-3.0.2/commons-logging-1.3.1.jar b/lib/pdfbox-3.0.2/commons-logging-1.3.1.jar new file mode 100644 index 0000000..0d508f6 Binary files /dev/null and b/lib/pdfbox-3.0.2/commons-logging-1.3.1.jar differ diff --git a/lib/pdfbox-3.0.2/fontbox-3.0.2.jar b/lib/pdfbox-3.0.2/fontbox-3.0.2.jar new file mode 100644 index 0000000..3d09a8e Binary files /dev/null and b/lib/pdfbox-3.0.2/fontbox-3.0.2.jar differ diff --git a/lib/pdfbox-3.0.2/pdfbox-3.0.2.jar b/lib/pdfbox-3.0.2/pdfbox-3.0.2.jar new file mode 100644 index 0000000..234fbbd Binary files /dev/null and b/lib/pdfbox-3.0.2/pdfbox-3.0.2.jar differ diff --git a/lib/pdfbox-3.0.2/pdfbox-io-3.0.2.jar b/lib/pdfbox-3.0.2/pdfbox-io-3.0.2.jar new file mode 100644 index 0000000..b151bb8 Binary files /dev/null and b/lib/pdfbox-3.0.2/pdfbox-io-3.0.2.jar differ diff --git a/package.json b/package.json index 52ffdb2..841dea3 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "doc": "docs" }, "scripts": { - "test": "basex -t ." + "test": "%BASEX10%/bin/basex -t ." 
}, "keywords": [ "pdf", diff --git a/samples.pdf/Legal_RAG_Hallucinations.pdf b/samples.pdf/Legal_RAG_Hallucinations.pdf new file mode 100644 index 0000000..260b364 Binary files /dev/null and b/samples.pdf/Legal_RAG_Hallucinations.pdf differ diff --git a/samples.pdf/readme.md b/samples.pdf/readme.md index ed973fd..70bb392 100644 --- a/samples.pdf/readme.md +++ b/samples.pdf/readme.md @@ -4,4 +4,5 @@ * [BaseX100.pdf](https://files.basex.org/releases/10.0/BaseX100.pdf) * [icelandic-dictionary.pdf](http://css4.pub/2015/icelandic/dictionary.pdf) * [page-numbers.pdf](https://www.w3.org/WAI/WCAG22/working-examples/pdf-page-numbers/page-numbers). -* [Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdfpdf) +* [Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans](https://www.lse.ac.uk/News/News-Assets/PDFs/2021/Sentience-in-Cephalopod-Molluscs-and-Decapod-Crustaceans-Final-Report-November-2021.pdf) +* [Legal RAG Hallucinations](https://law.stanford.edu/wp-content/uploads/2024/05/Legal_RAG_Hallucinations.pdf) diff --git a/src/lib/pdfscrape.xqm b/src/lib/pdfscrape.xqm index dfde25c..9304570 100644 --- a/src/lib/pdfscrape.xqm +++ b/src/lib/pdfscrape.xqm @@ -35,7 +35,7 @@ as element(page){ return { $found, $line1 } }; -(:~ empty or attributes created by matching $style with $line1 :) +(:~ attributes created by matching $style with $line1 or empty :) declare function pdfscrape:line-report($style as xs:string, $line1 as xs:string) as attribute(*)*{ if(matches($line1,$pdfscrape:pats?($style))) @@ -52,10 +52,12 @@ as map(*) { $pages[@number]!map:entry(string(@number),string(@index)) =>map:merge(map{"duplicates":"combine"}) }; + (:~ %match $l page labels :) -declare function pdfscrape:score($l as xs:string*,$report as element(page)*) +declare function pdfscrape:score($l as xs:string*, + $report as element(page)*) { let 
$s:=$report!(if(@number)then string(@number) else "") let $match:= for-each-pair($l,$s,function($l,$s){if($s eq "")then 0 else if ($s eq $l)then 1 else -1}) @@ -76,3 +78,7 @@ as xs:integer{ => array:head() }; +declare function pdfscrape:characters($str as xs:string) +{ + +}; diff --git a/src/scratch/pdfbox.xq b/src/scratch/pdfbox.xq index 36c67a5..1834e2e 100644 --- a/src/scratch/pdfbox.xq +++ b/src/scratch/pdfbox.xq @@ -19,6 +19,5 @@ $samples?world=>file:resolve-path($base) -let $doc:=pdfbox:open($PDF) -return pdfbox:information($doc) \ No newline at end of file + pdfbox:report($PDF) \ No newline at end of file