PDF to text using pytesseract

step-1

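First, download and extract Poppler. The script needs the path to Poppler's bin folder, which it passes to convert_from_path as poppler_path.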

step-2

pip install opencv-python

step-3

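pip install pdf2image pytesseract

Note: pytesseract is only a Python wrapper. The Tesseract-OCR program itself must also be installed; the script expects its executable at C:\Program Files (x86)\Tesseract-OCR\tesseract.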

from pdf2image import convert_from_path

import pytesseract

import cv2

# Convert every page of a PDF to text: render each page to an image with
# pdf2image (which needs Poppler), save it as a JPEG, reload it with OpenCV,
# convert it to grayscale, and OCR it with Tesseract via pytesseract.

# Hard-coded, machine-specific locations; adjust these for your own setup.
PDF_PATH = r"C:\Users\darsh\OneDrive\Desktop\ML\what is ML.pdf"
POPPLER_BIN = r"C:\Program Files\poppler-0.68.0_x86\poppler-0.68.0\bin"
TESSERACT_EXE = r"C:\Program Files (x86)\Tesseract-OCR\tesseract"

# Point pytesseract at the Tesseract executable. The value is the same for
# every page, so it is set once here rather than inside the loop.
pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# Render the PDF pages to images at 500 DPI.
images = convert_from_path(PDF_PATH, dpi=500, poppler_path=POPPLER_BIN)

for i, page in enumerate(images):
    # Save each rendered page as page<i>.jpg in the current working directory.
    page_file = 'page' + str(i) + '.jpg'
    page.save(page_file, 'JPEG')

    # Reload the saved page with OpenCV. cv2.imread signals failure by
    # returning None instead of raising, so check before using the result.
    src = cv2.imread(page_file)
    if src is None:
        raise FileNotFoundError('could not read saved page image: ' + page_file)

    # OCR the grayscale version of the page and print the recognised text.
    img1 = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(img1)
    print(text)

gujju computervalo

Search This Blog