%pip install azure-ai-textanalytics pdfplumber Unidecode python-dotenv
# "Azure Text Summarization"
> "How to use Azure Text Summarization with PDF, TXT and simple text"
- toc: false
- branch: master
- badges: true
- comments: true
- categories: [azure, cognitive services, summarization]
- hide: true
- search_exclude: false
from typing import List
import pdfplumber
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from azure.ai.textanalytics import ExtractSummaryAction
from dotenv import load_dotenv
import os
from unidecode import unidecode
DOTENV_FILEPATH = ''  # path to the .env file holding the credentials

# load the environment variables before reading them
load_dotenv(DOTENV_FILEPATH)

CS_ENDPOINT = os.getenv('CV_ENDPOINT')
CS_KEY = os.getenv('CV_KEY')
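The `.env` file only needs the two variables read above; the values here are placeholders for your own Language resource:

CV_ENDPOINT=https://<your-resource-name>.cognitiveservices.azure.com/
CV_KEY=<your-key>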
# https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/textanalytics/azure-ai-textanalytics/samples/sample_extract_summary.py
def pdf_parser(
    filepath: str,
    x_tolerance=1,
    y_tolerance=1
) -> List[str]:
    texts = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            # extract_text() may come back empty for image-only pages
            text = page.extract_text(x_tolerance=x_tolerance, y_tolerance=y_tolerance)
            if text:
                texts.append(unidecode(text))
    return texts
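`pdf_parser` returns one string per page, so a quick sanity check on the sample PDF used below looks like this:

pages = pdf_parser(filepath='my_sample.pdf')
print(len(pages), pages[0][:100])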
def split_in_chunks(lst, chunk_size: int):
    chunked_list = list()
    for i in range(0, len(lst), chunk_size):
        chunked_list.append(lst[i:i+chunk_size])
    return chunked_list
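For example, seven items with `chunk_size=3` come back as chunks of 3, 3 and 1:

print(split_in_chunks(list(range(7)), chunk_size=3))
# [[0, 1, 2], [3, 4, 5], [6]]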
def az_summary(
    texts: List[str],
    cs_endpoint: str,
    cs_key: str,
    language: str
):
    # Build one document per text, with explicit ids and language
    az_docs = []
    for i in range(len(texts)):
        doc = {"id": str(i), "language": language, "text": texts[i]}
        az_docs.append(doc)

    text_analytics_client = TextAnalyticsClient(
        endpoint=cs_endpoint,
        credential=AzureKeyCredential(cs_key),
    )

    # Pass the prepared documents so the language setting is honored
    poller = text_analytics_client.begin_analyze_actions(
        documents=az_docs,
        actions=[
            ExtractSummaryAction(order_by='Rank'),
        ],
    )

    extract_summary_results = []
    document_results = poller.result()
    for result in document_results:
        for ex in result:
            if not ex.is_error:
                extract_summary_results.append(ex)
    return extract_summary_results
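Each `ExtractSummaryResult` holds the extracted sentences with a `rank_score` between 0 and 1 (higher means more relevant), which is what the threshold below filters on. A quick way to inspect them:

results = az_summary(
    texts=["Azure can rank the sentences of a document by relevance."],
    cs_endpoint=CS_ENDPOINT,
    cs_key=CS_KEY,
    language='en'
)
for sentence in results[0].sentences:
    print(f"{sentence.rank_score:.2f}  {sentence.text}")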
def summarize(summaries, thr=0):
    sentences = []
    for sr in summaries:
        for sentence in sr.sentences:
            if sentence.rank_score >= thr:
                sentences.append(sentence.text)
    sentences = list(set(sentences))
    return sentences
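To see the threshold filter in isolation, a hypothetical stand-in with the same `sentences`/`rank_score` shape works without calling the service:

from types import SimpleNamespace

# hypothetical stand-in for an ExtractSummaryResult
fake_result = SimpleNamespace(sentences=[
    SimpleNamespace(text="Highly relevant sentence.", rank_score=0.9),
    SimpleNamespace(text="Marginal sentence.", rank_score=0.2),
])
print(summarize([fake_result], thr=0.5))  # ['Highly relevant sentence.']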
def summarize_pdf(
    filepath: str,
    cs_endpoint: str,
    cs_key: str,
    language: str,
    thr=0
):
    pdf_text = pdf_parser(filepath=filepath)
    # The analyze API accepts at most 25 documents per request,
    # so the pages are sent in batches of 25
    chunks = split_in_chunks(
        lst=pdf_text,
        chunk_size=25
    )
    summaries = []
    for texts in chunks:
        st = az_summary(
            texts=texts,
            cs_endpoint=cs_endpoint,
            cs_key=cs_key,
            language=language
        )
        summaries.extend(st)
    sentences = summarize(summaries, thr)
    return sentences
def summarize_txt(
    filepath: str,
    cs_endpoint: str,
    cs_key: str,
    language: str,
    thr=0
):
    with open(filepath, 'r', encoding='utf-8') as fh:
        txt = fh.read()
    summary = az_summary(
        texts=[txt],
        cs_endpoint=cs_endpoint,
        cs_key=cs_key,
        language=language
    )
    sentences = summarize(summary, thr)
    return sentences
summary_pdf = summarize_pdf(
    filepath='my_sample.pdf',
    cs_endpoint=CS_ENDPOINT,
    cs_key=CS_KEY,
    language='en',
    thr=0.5
)
print(summary_pdf)
summary_txt = summarize_txt(
    filepath='my_sample.txt',
    cs_endpoint=CS_ENDPOINT,
    cs_key=CS_KEY,
    language='en',
    thr=0.5
)
print(summary_txt)
summary_results = az_summary(
    texts=["""My sample text"""],
    cs_endpoint=CS_ENDPOINT,
    cs_key=CS_KEY,
    language='en'
)
summary_text = summarize(summary_results, thr=0.5)
print(summary_text)