aaronmauro/pdf_to_txt.py

## pdf_to_txt.py
#!/usr/bin/env python
# coding: utf-8

# # Convert PDF to TXT
#
# [Types of PDFs according to Adobe](https://www.adobe.com/uk/acrobat/resources/document-files/pdf-types.html)
#
# > There are nine different PDF types:
# > - PDF. This PDF file type is seen as the ‘standard’ PDF format. It is often used for sharing and viewing files online.
# > - PDF/A. This type of PDF file is often used by managers and archivists who require long-term file storage. It also has a restricted set of features, including JavaScript, audio and video content.
# > - PDF/E. This format supports construction, engineering and manufacturing specifications and is often used in those industries.
# > - PDF/X. Commonly used by graphic designers and print professionals, this PDF file type is designed to better support graphics when sharing and printing.
# > - PDF/VT. This file is similar to the PDF/X format but features additional customisation features. Again, the PDF/VT file type is most suited to print professionals and graphic designers.
# > - PDF/UA. This type of format is compatible with assistive technology, enhancing readability and navigation for people with disabilities.
# > - PAdES. Sets the standards for PDF Advanced Electronic Signatures in line with major legislation.
# > - PDF Healthcare. This standard was developed to secure best practice for handling and managing healthcare information.
# > - Searchable PDF. A searchable PDF is essentially a standard PDF file with a search function. The basic function of a searchable-PDF is to make image-based PDFs text-searchable.
#

# In[ ]:


# Module-level metadata: plain string constants, not referenced by the
# rest of the visible script.

# -- author --
__author__ = "Aaron Mauro"
__email__ = "amauro@brocku.ca"
__role__ = "educator"

# -- affiliation --
__institution__ = "Brock University"
__workshop__ = "Digital Humanities Summer Institute"

# -- release --
__version__ = "0.1"
__license__ = "creative commons by-sa"


# In[ ]:


# Install PyPDF2 before it is imported further down.
#
# `get_ipython` is only defined inside an IPython/Jupyter session. When this
# exported notebook is run as a plain script the name does not exist, so the
# lookup is isolated in a minimal try block and a pip fallback is used instead.
try:
    _shell = get_ipython()
except NameError:
    _shell = None

if _shell is not None:
    # Inside IPython/Jupyter: same shell call as the notebook cell.
    _shell.system('pip install PyPDF2')
else:
    import subprocess
    import sys

    # Run pip through the interpreter executing this script so the package
    # lands in the same environment; raises CalledProcessError on failure.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])


# In[ ]:


import PyPDF2

# Extract the text of every page of example.pdf and append it to ./out.txt.
with open('example.pdf', 'rb') as pdf_file:  # opened in binary mode for the PDF reader
    reader = PyPDF2.PdfReader(pdf_file)  # build reader object from the open PDF
    # Append mode is kept on purpose: re-running adds to out.txt instead of
    # replacing it. The encoding is set explicitly so the output does not
    # depend on the platform's default text encoding, which may be unable to
    # represent some of the extracted characters.
    with open("./out.txt", "a", encoding="utf-8") as outfile:
        for page in reader.pages:  # iterate the pages directly; no index needed
            # Fall back to "" in case a page yields no text at all.
            text = page.extract_text() or ""
            # `text` is a single string, so write() is the right call;
            # writelines() would iterate it one character at a time.
            outfile.write(text)
            # Keep the last line of this page from fusing with the first
            # line of the next page.
            if text and not text.endswith("\n"):
                outfile.write("\n")
	#!/usr/bin/env python
	# coding: utf-8

	# # Convert PDF to TXT
	#
	# [Types of PDFs according to Adobe](https://www.adobe.com/uk/acrobat/resources/document-files/pdf-types.html)
	#
	# > There are nine different PDF types:
	# > - PDF. This PDF file type is seen as the ‘standard’ PDF format. It is often used for sharing and viewing files online.
	# > - PDF/A. This type of PDF file is often used by managers and archivists who require long-term file storage. It also has a restricted set of features, including JavaScript, audio and video content.
	# > - PDF/E. This format supports construction, engineering and manufacturing specifications and is often used in those industries.
	# > - PDF/X. Commonly used by graphic designers and print professionals, this PDF file type is designed to better support graphics when sharing and printing.
	# > - PDF/VT. This file is similar to the PDF/X format but features additional customisation features. Again, the PDF/VT file type is most suited to print professionals and graphic designers.
	# > - PDF/UA. This type of format is compatible with assistive technology, enhancing readability and navigation for people with disabilities.
	# > - PAdES. Sets the standards for PDF Advanced Electronic Signatures in line with major legislation.
	# > - PDF Healthcare. This standard was developed to secure best practice for handling and managing healthcare information.
	# > - Searchable PDF. A searchable PDF is essentially a standard PDF file with a search function. The basic function of a searchable-PDF is to make image-based PDFs text-searchable.
	#

	# In[ ]:


	# Module-level metadata: plain string constants, not referenced by the
	# rest of the visible script.

	# -- author --
	__author__ = "Aaron Mauro"
	__email__ = "amauro@brocku.ca"
	__role__ = "educator"

	# -- affiliation --
	__institution__ = "Brock University"
	__workshop__ = "Digital Humanities Summer Institute"

	# -- release --
	__version__ = "0.1"
	__license__ = "creative commons by-sa"


	# In[ ]:


	# Install PyPDF2 before it is imported further down.
	#
	# `get_ipython` is only defined inside an IPython/Jupyter session. When this
	# exported notebook is run as a plain script the name does not exist, so the
	# lookup is isolated in a minimal try block and a pip fallback is used instead.
	try:
		_shell = get_ipython()
	except NameError:
		_shell = None

	if _shell is not None:
		# Inside IPython/Jupyter: same shell call as the notebook cell.
		_shell.system('pip install PyPDF2')
	else:
		import subprocess
		import sys

		# Run pip through the interpreter executing this script so the package
		# lands in the same environment; raises CalledProcessError on failure.
		subprocess.check_call([sys.executable, "-m", "pip", "install", "PyPDF2"])


	# In[ ]:


	import PyPDF2

	# Extract the text of every page of example.pdf and append it to ./out.txt.
	# The nesting below is restored: each `with`/`for` body must be indented
	# one level deeper than its header to be valid Python.
	with open('example.pdf', 'rb') as pdf_file:  # opened in binary mode for the PDF reader
		reader = PyPDF2.PdfReader(pdf_file)  # build reader object from the open PDF
		# Append mode is kept on purpose: re-running adds to out.txt instead of
		# replacing it. The encoding is set explicitly so the output does not
		# depend on the platform's default text encoding, which may be unable to
		# represent some of the extracted characters.
		with open("./out.txt", "a", encoding="utf-8") as outfile:
			for page in reader.pages:  # iterate the pages directly; no index needed
				# Fall back to "" in case a page yields no text at all.
				text = page.extract_text() or ""
				# `text` is a single string, so write() is the right call;
				# writelines() would iterate it one character at a time.
				outfile.write(text)
				# Keep the last line of this page from fusing with the first
				# line of the next page.
				if text and not text.endswith("\n"):
					outfile.write("\n")
No results found