defmodule MyApp.PyPDFParser do
  @moduledoc """
  PDF parser that shells out to Python and extracts text with PyPDF2.

  Requires a `python3` executable on the `PATH` with the `PyPDF2` package
  installed.
  """

  @behaviour Arcana.FileParser.PDF

  # The PDF path is handed over as `sys.argv[1]`, never interpolated into the script.
  # Failures are reported on stdout with a non-zero exit status because
  # `System.cmd/3` only captures stdout by default; without this the error
  # reason returned to the caller would be an empty string.
  @python_script """
  import sys

  try:
      from PyPDF2 import PdfReader

      reader = PdfReader(sys.argv[1])
      text = ''.join(page.extract_text() or '' for page in reader.pages)
  except Exception as exc:
      print('%s: %s' % (type(exc).__name__, exc))
      sys.exit(1)

  print(text)
  """

  @doc """
  Extracts the text of every page of the PDF at `path`.

  Returns `{:ok, text}` when the Python process exits with status 0, and
  `{:error, {:pypdf_failed, message}}` otherwise. `message` is the Python
  exception summary, or a note that `python3` could not be found.
  """
  @impl true
  def parse(path, _opts) when is_binary(path) do
    case System.find_executable("python3") do
      nil ->
        # System.cmd/3 would raise here; keep the tagged-tuple contract instead.
        {:error, {:pypdf_failed, "python3 executable not found on PATH"}}

      python ->
        run_script(python, path)
    end
  end

  @doc """
  Reports that this parser only accepts file paths, not raw PDF binaries.
  """
  def supports_binary?, do: false

  # Runs the extraction script and maps the exit status onto a tagged tuple.
  defp run_script(python, path) do
    case System.cmd(python, ["-c", @python_script, path]) do
      {text, 0} -> {:ok, text}
      {error, _status} -> {:error, {:pypdf_failed, error}}
    end
  end
end
High-performance native PDF parsing:
defmodule MyApp.RustPDFParser do
  @moduledoc """
  PDF parser that delegates to a native function loaded through Rustler
  (crate `:pdf_parser`, OTP app `:my_app`).
  """

  @behaviour Arcana.FileParser.PDF

  use Rustler, otp_app: :my_app, crate: :pdf_parser

  @doc """
  NIF stub; the real implementation lives in Rust.

  Raises `:nif_not_loaded` through `:erlang.nif_error/1` when called while the
  native library has not replaced this function.
  """
  def parse_pdf_native(_path), do: :erlang.nif_error(:nif_not_loaded)

  @doc """
  Parses the PDF at `path` with the native parser.

  The `{:ok, text}` or `{:error, reason}` tuple produced by
  `parse_pdf_native/1` is returned unchanged.
  """
  @impl true
  def parse(path, _opts) when is_binary(path) do
    case parse_pdf_native(path) do
      {:ok, _text} = success -> success
      {:error, _reason} = failure -> failure
    end
  end

  @doc """
  Reports that this parser only accepts file paths, not raw PDF binaries.
  """
  def supports_binary?, do: false
end
Some parsers can process PDF binary content directly (useful for file uploads):
defmodule MyApp.BinaryPDFParser do
  @moduledoc """
  PDF parser that accepts either a file path or the raw PDF content.
  """

  @behaviour Arcana.FileParser.PDF

  @doc """
  Extracts text from `binary`, which may be PDF content or a path to a PDF file.

  Input starting with the `%PDF-` magic header is treated as content right away,
  so uploaded documents are never probed against the filesystem. Any other input
  is treated as a path when something exists at that location, and as content
  otherwise.

  Returns whatever the extraction step returns, or
  `{:error, {:file_read_failed, reason}}` when the path exists but cannot be read
  (for example because it is a directory).
  """
  @impl true
  def parse(<<"%PDF-", _rest::binary>> = binary, opts), do: extract_text(binary, opts)

  def parse(binary, opts) when is_binary(binary) do
    if File.exists?(binary) do
      # File.exists?/1 is also true for directories and unreadable files, so read
      # non-raising and surface the failure as a tagged tuple.
      case File.read(binary) do
        {:ok, content} -> extract_text(content, opts)
        {:error, reason} -> {:error, {:file_read_failed, reason}}
      end
    else
      extract_text(binary, opts)
    end
  end

  @doc """
  Declares that this parser can process PDF content passed as a binary.
  """
  def supports_binary?, do: true

  # Placeholder for the real extraction logic; currently ignores its input.
  defp extract_text(_binary, _opts) do
    {:ok, "extracted text"}
  end
end
defmodule MyApp.OCRParser do
  @moduledoc """
  PDF parser that tries regular text extraction first and falls back to OCR
  (`pdftoppm` piped into `tesseract`) when too little text comes back.
  """

  @behaviour Arcana.FileParser.PDF

  # Extracted text must exceed this many bytes to skip the OCR fallback.
  @min_text_bytes 100

  # The path and language are supplied as the positional parameters $1 and $2
  # instead of being spliced into the command line, so spaces or shell
  # metacharacters in either value cannot alter the command that runs.
  # NOTE(review): pdftoppm is given the output root `page`, which presumably makes
  # it write page-N.png files rather than stream to stdout — confirm that this
  # pipeline actually feeds tesseract.
  @ocr_pipeline ~S(pdftoppm "$1" page -png | tesseract stdin stdout -l "$2")

  @doc """
  Extracts text from the PDF at `path`.

  ## Options

    * `:language` - language passed to Tesseract with `-l`; defaults to `"eng"`.

  `opts` is also forwarded unchanged to `Arcana.FileParser.PDF.Poppler.parse/2`.

  Returns `{:ok, text}` or `{:error, {:ocr_failed, output}}`.
  """
  @impl true
  def parse(path, opts) do
    language = opts[:language] || "eng"

    # First try normal text extraction.
    case Arcana.FileParser.PDF.Poppler.parse(path, opts) do
      {:ok, text} when byte_size(text) > @min_text_bytes ->
        # Got enough text, no OCR needed.
        {:ok, text}

      _ ->
        # Too little text or an extraction error: fall back to OCR.
        perform_ocr(path, language)
    end
  end

  @doc """
  Reports that this parser only accepts file paths, not raw PDF binaries.
  """
  def supports_binary?, do: false

  # Runs the OCR pipeline through `sh -c`. The extra "sh" argument becomes $0,
  # so the path and language land in $1 and $2.
  defp perform_ocr(path, language) do
    # to_string/1 keeps accepting the same values the former string
    # interpolation accepted (e.g. an atom language).
    args = ["-c", @ocr_pipeline, "sh", to_string(path), to_string(language)]

    case System.cmd("sh", args) do
      {text, 0} -> {:ok, text}
      {error, _status} -> {:error, {:ocr_failed, error}}
    end
  end
end
defmodule MyApp.TableParser do
  @moduledoc """
  PDF parser that extracts tables by shelling out to Python's `tabula` package
  and returns them rendered as text.
  """

  @behaviour Arcana.FileParser.PDF

  # The PDF path is read from `sys.argv[1]`. Interpolating it into the Python
  # source would let a path containing a quote break out of the string literal
  # and run arbitrary Python code.
  @python_script """
  import sys
  import tabula

  tables = tabula.read_pdf(sys.argv[1], pages='all')
  text = '\\n'.join([df.to_string() for df in tables])
  print(text)
  """

  @doc """
  Extracts every table from all pages of the PDF at `path`.

  Returns `{:ok, text}` when the Python process exits with status 0, otherwise
  `{:error, {:table_extraction_failed, output}}`.
  """
  @impl true
  def parse(path, _opts) do
    # to_string/1 keeps accepting the same values the former string
    # interpolation accepted.
    case System.cmd("python3", ["-c", @python_script, to_string(path)]) do
      {text, 0} -> {:ok, text}
      {error, _status} -> {:error, {:table_extraction_failed, error}}
    end
  end

  @doc """
  Reports that this parser only accepts file paths, not raw PDF binaries.
  """
  def supports_binary?, do: false
end
defmodule MyApp.MetadataPDFParser do
  @moduledoc """
  PDF parser that prefixes the extracted text with document metadata
  (title, author, creation date) read from `pdfinfo`.
  """

  @behaviour Arcana.FileParser.PDF

  # Lower-cased `pdfinfo` field names mapped to the metadata keys used here.
  # A fixed whitelist avoids creating atoms from external tool output, and maps
  # pdfinfo's `CreationDate` field onto the `:created` key read by `parse/2`.
  @pdfinfo_fields %{
    "title" => :title,
    "author" => :author,
    "creationdate" => :created
  }

  @doc """
  Extracts the text of the PDF at `path` and prepends a metadata header.

  Metadata is best-effort: when `pdfinfo` is missing or fails, the header
  fields are left empty. An error from the text extraction step is returned
  unchanged.
  """
  @impl true
  def parse(path, _opts) do
    with {:ok, text} <- extract_text(path),
         {:ok, metadata} <- extract_metadata(path) do
      # Include metadata in the text for ingestion.
      enriched_text = """
      Title: #{metadata[:title]}
      Author: #{metadata[:author]}
      Created: #{metadata[:created]}

      #{text}
      """

      {:ok, enriched_text}
    end
  end

  @doc """
  Reports that this parser only accepts file paths, not raw PDF binaries.
  """
  def supports_binary?, do: false

  # Delegates plain text extraction to the Poppler parser with default options.
  defp extract_text(path) do
    Arcana.FileParser.PDF.Poppler.parse(path, [])
  end

  # Always returns `{:ok, map}`; any pdfinfo problem yields an empty map.
  defp extract_metadata(path) do
    # Resolve the executable first: System.cmd/3 raises when it is missing,
    # which would defeat the best-effort fallback below.
    with pdfinfo when is_binary(pdfinfo) <- System.find_executable("pdfinfo"),
         {info, 0} <- System.cmd(pdfinfo, [path]) do
      {:ok, parse_pdfinfo(info)}
    else
      _ -> {:ok, %{}}
    end
  end

  # Turns "Key: value" lines into a map, keeping only whitelisted fields.
  # `parts: 2` keeps colons inside the value (e.g. timestamps) intact.
  defp parse_pdfinfo(info) do
    for line <- String.split(info, "\n"),
        [key, value] <- [String.split(line, ":", parts: 2)],
        {:ok, field} <- [Map.fetch(@pdfinfo_fields, String.downcase(key))],
        into: %{} do
      {field, String.trim(value)}
    end
  end
end
defmodule MyApp.PDFParserTest do
  use ExUnit.Case

  # Directory holding the PDF fixtures used by every test below.
  @fixtures_path "test/fixtures/pdfs"

  test "parses simple PDF" do
    path = Path.join(@fixtures_path, "simple.pdf")

    # `assert` on the match reports the actual return value on failure instead
    # of aborting with a bare MatchError.
    assert {:ok, text} = MyApp.CustomPDFParser.parse(path, [])
    assert text =~ "expected content"
    assert String.length(text) > 100
  end

  test "handles multi-page PDFs" do
    path = Path.join(@fixtures_path, "multipage.pdf")

    assert {:ok, text} = MyApp.CustomPDFParser.parse(path, [])
    assert text =~ "page 1 content"
    assert text =~ "page 2 content"
  end

  test "returns error for corrupted PDF" do
    path = Path.join(@fixtures_path, "corrupted.pdf")

    assert {:error, _} = MyApp.CustomPDFParser.parse(path, [])
  end

  test "supports binary content" do
    path = Path.join(@fixtures_path, "simple.pdf")
    binary = File.read!(path)

    # Only exercised when the parser declares binary support; otherwise this
    # test passes without asserting anything.
    if MyApp.CustomPDFParser.supports_binary?() do
      assert {:ok, text} = MyApp.CustomPDFParser.parse(binary, [])
      assert text =~ "expected content"
    end
  end
end