Converting a PDF to text using pytesseract (OCR)


 


step-1

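First, download Poppler (the download link is missing from this copy) and note the location of its "bin" folder; the script below passes that folder as poppler_path.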

step-2

pip install opencv-python

step-3


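pip install pdf2image

pip install pytesseract

(The script also imports pytesseract, which in turn needs the Tesseract-OCR program installed; its executable path is set through tesseract_cmd in the script.)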

from pdf2image import convert_from_path

import pytesseract

import cv2


# OCR every page of a PDF: render each page to an image with pdf2image,
# save it as a JPEG, convert it to grayscale with OpenCV, and print the
# text that Tesseract recognises on it.

# Machine-specific settings; adjust these for your own installation.
PDF_PATH = r"C:\Users\darsh\OneDrive\Desktop\ML\what is ML.pdf"
POPPLER_BIN = r"C:\Program Files\poppler-0.68.0_x86\poppler-0.68.0\bin"
TESSERACT_EXE = r"C:\Program Files (x86)\Tesseract-OCR\tesseract"
RENDER_DPI = 500

# Tell pytesseract where the Tesseract executable lives. This never changes
# between pages, so it is set once here rather than on every loop iteration.
pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# Render the PDF into one image per page.
images = convert_from_path(PDF_PATH, RENDER_DPI, poppler_path=POPPLER_BIN)

for i, page in enumerate(images):
    # Save the page as page<i>.jpg (relative path, so it lands in the
    # current working directory).
    page_file = f"page{i}.jpg"
    page.save(page_file, "JPEG")

    # Read the saved JPEG back with OpenCV and convert it to grayscale
    # before handing it to Tesseract.
    src = cv2.imread(page_file)
    img1 = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)

    text = pytesseract.image_to_string(img1)
    print(text)



Comments