Skip to content

Instantly share code, notes, and snippets.

@IllustratedMan-code
Created June 26, 2025 13:45
Show Gist options
  • Select an option

  • Save IllustratedMan-code/e39c6e70c0e8e12e35f81a025bb2e880 to your computer and use it in GitHub Desktop.

Select an option

Save IllustratedMan-code/e39c6e70c0e8e12e35f81a025bb2e880 to your computer and use it in GitHub Desktop.
A Python script that extracts the text from a particular PDF document
import fitz
# Open the source PDF and pull every page out as a nested dictionary
# (blocks -> lines -> spans) so the rest of the script can work on plain data.
doc = fitz.open("sapp_pwms.pdf")
pages = [pdf_page.get_text("dict") for pdf_page in doc]
def inspect(page, block):
    """Return the non-empty text spans of one block of a page.

    Args:
        page: A page dictionary; only
            ``page["blocks"][block]["lines"][...]["spans"]`` is read.
        block: Index of the block inside ``page["blocks"]``.

    Returns:
        A list of ``{"text": ..., "coords": [...]}`` dictionaries, one per
        span whose text is not the empty string, in the order the spans
        appear. ``coords`` is a new list built from the span's ``bbox``.
        A block without a ``"lines"`` entry yields an empty list.
    """
    # Blocks that carry no "lines" key (e.g. image blocks in PyMuPDF's
    # dict output) have no text; treat them as empty instead of raising.
    lines = page["blocks"][block].get("lines", ())
    return [
        {"text": span["text"], "coords": list(span["bbox"])}
        for line in lines
        for span in line["spans"]
        if span["text"] != ""
    ]
def merge_text(i, j, sep=""):
    """Fold span ``j`` into span ``i`` in place.

    ``i["text"]`` becomes ``i``'s text, then ``sep``, then ``j``'s text, and
    ``i["coords"]`` grows to the smallest box covering both spans
    (minimum of the first two coordinates, maximum of the last two).
    ``j`` is left untouched and nothing is returned.
    """
    i["text"] = sep.join((i["text"], j["text"]))
    box = i["coords"]
    other = j["coords"]
    # Positions 0/1 shrink towards the smaller value, 2/3 grow to the larger.
    for position, pick in enumerate((min, min, max, max)):
        box[position] = pick(box[position], other[position])
# On the first page, block 0 is the title and block 1 is the column header
# row. Read both, keeping only header spans that are not pure whitespace.
title = "".join(span["text"] for span in inspect(pages[0], 0))
columns = [span for span in inspect(pages[0], 1) if not span["text"].isspace()]

# Drop the title block and the column block so that only table content is
# left on the first page.
del pages[0]["blocks"][:2]
def merge_block(block, sep="|"):
    """Merge, in place, all spans of ``block`` that share a left x coordinate.

    The first span seen at a given ``coords[0]`` is kept; every later span
    with the same ``coords[0]`` is folded into it with ``merge_text`` (texts
    joined by ``sep``, bounding boxes unioned) and removed from ``block``.
    The relative order of the surviving spans is preserved.

    Args:
        block: List of ``{"text": ..., "coords": [...]}`` span dictionaries;
            mutated in place.
        sep: Separator inserted between merged texts.
    """
    first_at_x = {}
    kept = []
    for span in block:
        x0 = span["coords"][0]
        if x0 in first_at_x:
            merge_text(first_at_x[x0], span, sep=sep)
        else:
            first_at_x[x0] = span
            kept.append(span)
    # Rewrite the caller's list once at the end. Deleting entries while
    # iterating would make the loop skip the element after each deletion,
    # leaving a third span at the same x unmerged.
    block[:] = kept
merge_block(columns, sep="")  # fixes columns with long names

# Trim stray whitespace from every column name.
for column in columns:
    column["text"] = column["text"].strip()

# Stretch each column's right edge to the left edge of the next column;
# the last column keeps its own right edge.
for column, right_neighbour in zip(columns, columns[1:]):
    column["coords"][2] = right_neighbour["coords"][0]
def column_assign(text):
    """Return the name of the column a span falls under.

    A synthetic ``"rowname"`` column spanning from 0 up to the left edge of
    the first entry of the module-level ``columns`` is checked first, then
    each entry of ``columns`` in order. The first one whose right edge
    (``coords[2]``) lies strictly to the right of the span's left edge
    (``text["coords"][0]``) wins.

    Returns:
        The matching column's ``"text"``, or ``None`` when the span starts at
        or beyond the right edge of every column.
    """
    row_label = {"text": "rowname", "coords": (0, 0, columns[0]["coords"][0])}
    candidates = [row_label, *columns]
    return next(
        (
            candidate["text"]
            for candidate in candidates
            if text["coords"][0] < candidate["coords"][2]
        ),
        None,
    )
# NOTE(review): this rebinds the builtin name `sum` at module level. It looks
# like a leftover tally from debugging; confirm whether it is still needed.
sum = 0
def parse_pwm(pwm_column):
    """Split a flat list of PWM cell strings into per-matrix dictionaries.

    An entry containing any of the letters A, C, G or T is treated as a row
    label: its first character becomes the current row and the label itself
    is not stored. Every other entry is appended to the current row of the
    matrix being built. Entries seen before the first label are ignored.
    A new matrix is started whenever an "A" label directly follows a "T" row.

    Args:
        pwm_column: Iterable of strings (labels and values interleaved).

    Returns:
        A list of ``{"A": [...], "C": [...], "G": [...], "T": [...]}``
        dictionaries holding the value strings. The matrix under construction
        is always appended at the end, so an empty input yields one matrix
        with four empty rows.
    """
    bases = ("A", "C", "G", "T")
    pwms = []
    current_pwm = {base: [] for base in bases}
    mode = "0"  # sentinel: no row label has been seen yet
    for entry in pwm_column:
        if any(base in entry for base in bases):
            previous_mode, mode = mode, entry[0]
            if mode == "A" and previous_mode == "T":
                # A fresh "A" row after a "T" row closes the current matrix.
                pwms.append(current_pwm)
                current_pwm = {base: [] for base in bases}
            continue
        if mode != "0":
            current_pwm[mode].append(entry)
    pwms.append(current_pwm)
    return pwms
def parse_page(page):
    """Group the text of one page by the table column it falls under.

    Each block of ``page`` is read with ``inspect``; blocks with no text are
    skipped. Unless a block's first span belongs to the "Frecuency PWM"
    column, spans sharing a left x coordinate are merged with
    ``merge_block``. Every remaining span's stripped text is then appended
    to the list of the column chosen by ``column_assign``.

    Args:
        page: A page dictionary with a ``"blocks"`` list.

    Returns:
        A dictionary mapping column name to a list of cell strings, except
        for ``"Frecuency PWM"``, which is replaced by the output of
        ``parse_pwm``.

    Raises:
        KeyError: If the page produced no "Design" or "Frecuency PWM" column.
    """
    column_assignments = {}
    for block_index in range(len(page["blocks"])):
        spans = inspect(page, block_index)
        if not spans:
            continue
        # PWM blocks are left unmerged so each label/value stays separate.
        if column_assign(spans[0]) != "Frecuency PWM":
            merge_block(spans)
        for span in spans:
            assignment = column_assign(span)
            column_assignments.setdefault(assignment, []).append(span["text"].strip())
    # The last "Design" entry on a page is the page number, not table data.
    del column_assignments["Design"][-1]
    column_assignments["Frecuency PWM"] = parse_pwm(column_assignments["Frecuency PWM"])
    return column_assignments
# Parse every page, then stitch the per-page column lists together into one
# dictionary of column name -> all cells across the document.
all_pages = [parse_page(pg) for pg in pages]
concatenated_pages = {}
for page_columns in all_pages:
    for column_name, cells in page_columns.items():
        existing = concatenated_pages.get(column_name)
        if existing is None:
            # First sighting: adopt this page's list as the accumulator.
            concatenated_pages[column_name] = cells
        else:
            existing.extend(cells)
def find_blanks_and_remove_whitespace(concatenated_pages):
    """Mark doubled blanks in "Rank position" as "NA", then drop empty cells.

    Walking ``concatenated_pages["Rank position"]`` in order, the second of
    two consecutive empty strings is replaced by ``"NA"`` (after which the
    run is considered over). Afterwards every column list in
    ``concatenated_pages`` is replaced by a copy without its empty-string
    entries. The dictionary is modified in place and nothing is returned.

    Args:
        concatenated_pages: Dictionary of column name -> list of cells; must
            contain a ``"Rank position"`` key.
    """
    ranks = concatenated_pages["Rank position"]
    previous_blank = False
    for index, value in enumerate(ranks):
        if value != "":
            previous_blank = False
        elif previous_blank:
            ranks[index] = "NA"
            previous_blank = False
        else:
            previous_blank = True
    for key, cells in concatenated_pages.items():
        # Compare by value: identity checks against "" are not reliable.
        concatenated_pages[key] = [cell for cell in cells if cell != ""]
find_blanks_and_remove_whitespace(concatenated_pages)

# The first output row is the header: the column names in dictionary order.
final_representation = [list(concatenated_pages)]

# Every column except the last two contributes one cell per output row; the
# final two cells of each row are the row name and its "Design" entry.
leading_columns = list(concatenated_pages.values())[:-2]

# `index` walks the per-sequence cells once across all row names.
index = 0
for rowindex, rowname in enumerate(concatenated_pages["rowname"]):
    onecount = 0
    while index < len(concatenated_pages["Sequence"]):
        print(index, rowindex)
        rankpos = concatenated_pages["Rank position"][index]
        print(rankpos)
        # NOTE(review): this matches any rank of at most two characters that
        # contains "1" (e.g. "10", "21"), not only "1" - confirm intent.
        starts_group = rankpos == "NA" or ("1" in rankpos and len(rankpos) <= 2)
        if starts_group:
            onecount += 1
        # The second group start belongs to the next row name.
        if onecount > 1:
            break
        item = [column[index] for column in leading_columns]
        item.append(rowname)
        item.append(concatenated_pages["Design"][rowindex])
        final_representation.append(item)
        index += 1
import csv

# Write the assembled table as tab-separated text, quoting with single
# quotes only where a field requires it.
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t",
                        quotechar="'", quoting=csv.QUOTE_MINIMAL)
    writer.writerows(final_representation)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment