You are on page 1 of 2

import csv

import io
import os
import re
import pdfminer.high_level
from reportlab.pdfgen import canvas

import re

def redact_pii(text):
    """Return *text* with phone numbers, e-mail addresses and known names masked.

    Substitutions are applied in this order:

    1. Standalone 10-digit numbers      -> ``"xxxxxx"``
    2. E-mail-like tokens               -> ``"yyyyyy"``
    3. Listed organization names        -> ``"ORGANIZATION_NAME"``
    4. Listed employee names            -> ``"EMPLOYEE_NAME"``

    Names are matched case-insensitively and only as whole words, so
    ordinary words that merely contain a name ("summary", "pineapple")
    are left untouched.

    Args:
        text: The string to redact.

    Returns:
        A new string with the matches replaced by placeholders.
    """
    # Phone numbers: a run of exactly ten digits bounded by non-word chars.
    text = re.sub(r"\b\d{10}\b", "xxxxxx", text)

    # E-mail addresses: whitespace-delimited token of the form x@y.z.
    # Runs before the name passes so names inside an address are not
    # replaced separately.
    text = re.sub(r"\S+@\S+\.\S+", "yyyyyy", text)

    name_groups = (
        ("ORGANIZATION_NAME", ["Google", "Microsoft", "Apple"]),
        ("EMPLOYEE_NAME", ["Jane", "John", "Mary"]),
    )
    for placeholder, names in name_groups:
        # re.escape keeps any regex metacharacters in a name literal;
        # \b...\b stops matches inside longer words.
        alternatives = "|".join(re.escape(name) for name in names)
        pattern = r"\b(?:" + alternatives + r")\b"
        text = re.sub(pattern, placeholder, text, flags=re.IGNORECASE)

    return text

# Open the PDF file and read its contents.
# The input handle is only needed for text extraction, so it is closed
# before any output file is opened.
with open("/content/demo.pdf", "rb") as pdf_file:
    content = pdfminer.high_level.extract_text(pdf_file)

# Redact PII content.
redacted_content = redact_pii(content)

# Create a CSV writer and record the entities found in the original text.
# All writerow() calls must happen while csv_file is still open.
with open("redact.csv", "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Entity", "Type"])

    # NOTE(review): the lookbehind skips 10-digit numbers directly preceded
    # by "Phone" + one whitespace char, while redact_pii() masks every
    # 10-digit number -- confirm whether this exclusion is intended.
    phone_numbers = re.findall(r"(?<!Phone\s)\b\d{10}\b", content)
    for phone_number in phone_numbers:
        csv_writer.writerow([phone_number, "Phone no"])

    # Lower-case the text once instead of once per name.
    lowered_content = content.lower()

    # Names are detected with a case-insensitive substring test; each listed
    # name yields at most one row regardless of how often it occurs.
    org_names = ["Google", "Microsoft", "Apple"]
    for org_name in org_names:
        if org_name.lower() in lowered_content:
            csv_writer.writerow([org_name, "Organization"])

    emp_names = ["Jane", "John", "Mary"]
    for emp_name in emp_names:
        if emp_name.lower() in lowered_content:
            csv_writer.writerow([emp_name, "Employee"])

# Save the redacted PDF file.
# Layout values are in PDF points, measured from the bottom-left corner.
PAGE_TOP_Y = 750
LINE_HEIGHT = 20
BOTTOM_MARGIN_Y = 50

pdf_canvas = canvas.Canvas("redacted_demo.pdf")
y = PAGE_TOP_Y
for line in redacted_content.split("\n"):
    if y < BOTTOM_MARGIN_Y:
        # Without a page break, lines past the bottom edge would be drawn
        # off-page and silently lost.
        pdf_canvas.showPage()
        y = PAGE_TOP_Y
    pdf_canvas.drawString(50, y, line)
    y -= LINE_HEIGHT
pdf_canvas.save()

You might also like