“Extract text from a PDF”

“How to extract text from a PDF”

Notes

Packages

%pip install pdfplumber Unidecode
from typing import List
import pdfplumber
from unidecode import unidecode
def pdf_parser(
    filepath: str,
    x_tolerance: float = 1,
    y_tolerance: float = 1,
) -> List[str]:
    """Extract the text of every page of a PDF, transliterated with unidecode.

    Args:
        filepath: Path of the PDF file; passed to ``pdfplumber.open``.
        x_tolerance: Forwarded unchanged to ``page.extract_text`` as its
            ``x_tolerance`` argument.
        y_tolerance: Forwarded unchanged to ``page.extract_text`` as its
            ``y_tolerance`` argument.

    Returns:
        A list with one string per page, in page order. A page for which
        pdfplumber yields no text contributes an empty string, so list
        indices always line up with page positions.
    """
    with pdfplumber.open(filepath) as pdf:
        # Some pdfplumber releases return None (rather than "") for a page
        # with no extractable text; unidecode() cannot handle None, so
        # coerce it to an empty string instead of crashing the whole parse.
        return [
            unidecode(
                page.extract_text(
                    x_tolerance=x_tolerance,
                    y_tolerance=y_tolerance,
                )
                or ""
            )
            for page in pdf.pages
        ]
# Demo: parse the sample PDF and show the resulting list of per-page strings.
pdf_texts = pdf_parser("my_sample.pdf")
print(pdf_texts)