%pip install pdfplumber Unidecode
“Extract text from pdf”
“How to extract text from a PDF”
- toc: false
- branch: master
- badges: true
- comments: true
- categories: [snippet]
- hide: true
- search_exclude: false
- metadata_key1: metadata_value1
- metadata_key2: metadata_value2
- food: food
Notes
Packages
from typing import List
import pdfplumber
from unidecode import unidecode


def pdf_parser(
    filepath: str,
    x_tolerance: float = 1,
    y_tolerance: float = 1,
) -> List[str]:
    """Extract the text of every page of a PDF as ASCII strings.

    The file is opened with ``pdfplumber``; each page's text is extracted
    and then passed through ``unidecode`` to transliterate non-ASCII
    characters.

    Args:
        filepath: Path of the PDF file to read.
        x_tolerance: Forwarded unchanged to ``page.extract_text``.
        y_tolerance: Forwarded unchanged to ``page.extract_text``.

    Returns:
        One string per page, in page order. A page for which no text
        could be extracted contributes an empty string, so list indices
        always line up with page positions.
    """
    texts = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(
                x_tolerance=x_tolerance, y_tolerance=y_tolerance
            )
            # Depending on the pdfplumber version, extract_text may return
            # None for a page without extractable text; unidecode needs a
            # str, so fall back to an empty string instead of raising.
            texts.append(unidecode(page_text or ""))
    return texts


pdf_texts = pdf_parser(filepath='my_sample.pdf')
print(pdf_texts)