Created
June 26, 2025 13:45
-
-
Save IllustratedMan-code/e39c6e70c0e8e12e35f81a025bb2e880 to your computer and use it in GitHub Desktop.
A Python script that extracts the text from a particular PDF document
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fitz | |
# Pull the layout dictionary of every page up front, then release the PDF:
# everything below works on the extracted dicts, not on the open document.
with fitz.open("sapp_pwms.pdf") as doc:
    pages = [page.get_text("dict") for page in doc]
def inspect(page, block):
    """Return the non-empty text spans of one block of a page dict.

    Args:
        page: A page dictionary with a ``"blocks"`` list (the script builds
            these with ``get_text("dict")``).
        block: Index of the block inside ``page["blocks"]``.

    Returns:
        A list of ``{"text": str, "coords": [x0, y0, x1, y1]}`` dicts, one per
        span whose text is not the empty string, in reading order. ``coords``
        is a fresh list copy of the span's ``bbox`` so callers may mutate it.
        Blocks without a ``"lines"`` entry yield an empty list.

    Raises:
        IndexError: If ``block`` is out of range for ``page["blocks"]``.
    """
    # Blocks that carry no "lines" key (e.g. image blocks) simply have no text.
    lines = page["blocks"][block].get("lines", ())
    return [
        {"text": span["text"], "coords": list(span["bbox"])}
        for line in lines
        for span in line["spans"]
        if span["text"] != ""
    ]
def merge_text(i, j, sep=""):
    """Fold span ``j`` into span ``i`` in place.

    ``i["text"]`` becomes ``i["text"] + sep + j["text"]`` and ``i["coords"]``
    grows to the bounding box that encloses both spans. ``j`` is left
    untouched and nothing is returned.
    """
    i["text"] = sep.join((i["text"], j["text"]))
    box, other = i["coords"], j["coords"]
    # Top-left corner shrinks towards the minimum, bottom-right grows to the maximum.
    for axis, pick in enumerate((min, min, max, max)):
        box[axis] = pick(box[axis], other[axis])
# The first page starts with two header blocks: block 0 holds the title and
# block 1 the column headings (whitespace-only spans are not headings).
title = "".join(span["text"] for span in inspect(pages[0], 0))
columns = [span for span in inspect(pages[0], 1) if not span["text"].isspace()]
# Drop the title block and the column block so only table content remains.
del pages[0]["blocks"][:2]
def merge_block(block, sep="|"):
    """Merge, in place, all spans of ``block`` that share a left x-coordinate.

    The first span seen at each ``coords[0]`` value is kept; every later span
    with the same left edge is folded into it with ``merge_text`` (texts joined
    by ``sep``, bounding boxes unioned) and removed from ``block``.

    Args:
        block: List of ``{"text", "coords"}`` span dicts; modified in place.
        sep: Separator placed between merged texts.

    Returns:
        None. ``block`` itself is updated so existing references see the result.
    """
    first_at_x = {}
    kept = []
    # Build the survivor list separately instead of deleting from ``block``
    # while iterating over it: deleting mid-iteration skips the element that
    # slides into the freed slot and invalidates previously stored indices.
    for span in block:
        left = span["coords"][0]
        if left in first_at_x:
            merge_text(first_at_x[left], span, sep=sep)
        else:
            first_at_x[left] = span
            kept.append(span)
    block[:] = kept
# Headings with long names are split over several spans; glue them together.
merge_block(columns, sep="")
for column in columns:
    column["text"] = column["text"].strip()  # fixes whitespace
# Stretch every heading's right edge to where the next heading begins, so a
# column covers the full horizontal gap up to its neighbour. The last heading
# keeps its own right edge.
for column, right_neighbour in zip(columns, columns[1:]):
    column["coords"][2] = right_neighbour["coords"][0]
def column_assign(text):
    """Return the name of the column a span falls into.

    A synthetic ``"rowname"`` column spans everything left of the first
    heading in the module-level ``columns`` list. The span is assigned to the
    first column (left to right) whose right edge lies beyond the span's left
    x-coordinate. Returns ``None`` when no column qualifies.
    """
    rowname = {"text": "rowname", "coords": (0, 0, columns[0]["coords"][0])}
    left = text["coords"][0]
    matches = (
        candidate["text"]
        for candidate in [rowname, *columns]
        if left < candidate["coords"][2]
    )
    return next(matches, None)
# Debug counter left over from development; no live code reads or updates it.
# NOTE(review): this name shadows the builtin ``sum`` for the whole module.
sum = 0


def parse_pwm(pwm_column):
    """Group a flat list of PWM cells into per-matrix dictionaries.

    Any cell containing one of the letters ``A``, ``C``, ``G`` or ``T`` is
    treated as a row label; its first character selects the row that the
    following cells are appended to. A label starting with ``A`` that follows
    a ``T`` row closes the current matrix and opens a new one. Cells seen
    before the first label are discarded.

    Returns:
        A list of ``{"A": [...], "C": [...], "G": [...], "T": [...]}`` dicts.
        The matrix in progress is always appended at the end, so the result
        has at least one entry even for empty input.
    """

    def _blank_matrix():
        return {base: [] for base in "ACGT"}

    matrices = []
    matrix = _blank_matrix()
    active_row = "0"  # "0" means no row label has been seen yet
    for cell in pwm_column:
        is_label = any(base in cell for base in ("A", "C", "G", "T"))
        if not is_label:
            if active_row != "0":
                matrix[active_row].append(cell)
            continue
        previous_row, active_row = active_row, cell[0]
        if previous_row == "T" and active_row == "A":
            matrices.append(matrix)
            matrix = _blank_matrix()
    matrices.append(matrix)
    return matrices
def parse_page(page):
    """Sort every span of a page into its table column.

    Each block is read with ``inspect``; empty blocks are skipped. Unless the
    block's first span belongs to the "Frecuency PWM" column, spans sharing a
    left edge are merged first. Every span's stripped text is then appended
    to the list of the column ``column_assign`` picks for it.

    Returns:
        A dict mapping column name to its list of cell texts, except
        "Frecuency PWM", which is replaced by the output of ``parse_pwm``.

    Raises:
        KeyError: If the page produced no "Design" or "Frecuency PWM" cells.
    """
    column_assignments = {}
    for block_index, _ in enumerate(page["blocks"]):
        spans = inspect(page, block_index)
        if not spans:
            continue
        if column_assign(spans[0]) != "Frecuency PWM":
            merge_block(spans)
        for span in spans:
            cells = column_assignments.setdefault(column_assign(span), [])
            cells.append(span["text"].strip())
    del column_assignments["Design"][-1]  # delete page number
    column_assignments["Frecuency PWM"] = parse_pwm(
        column_assignments["Frecuency PWM"]
    )
    return column_assignments
all_pages = [parse_page(pg) for pg in pages]

# Stitch the per-page column lists together into one list per column.
concatenated_pages = {}
for page_columns in all_pages:
    for name, cells in page_columns.items():
        if name not in concatenated_pages:
            # First sighting: adopt the page's own list (later pages extend it).
            concatenated_pages[name] = cells
            continue
        concatenated_pages[name].extend(cells)
def find_blanks_and_remove_whitespace(concatenated_pages):
    """Mark missing rank positions, then drop empty cells from every column.

    In the "Rank position" column, the second of two consecutive empty
    strings is replaced by ``"NA"`` (the blank-run tracking then restarts).
    Afterwards every column list is replaced by a copy without empty-string
    cells. NOTE(review): presumably a doubled blank marks a row whose rank is
    absent in the PDF - confirm against the source document.

    Args:
        concatenated_pages: Dict of column name -> list of cells; must contain
            "Rank position". Modified in place.

    Returns:
        None.
    """
    ranks = concatenated_pages["Rank position"]
    previous_blank = False
    for index, value in enumerate(ranks):
        if value != "":
            previous_blank = False
        elif previous_blank:
            ranks[index] = "NA"
            previous_blank = False
        else:
            previous_blank = True
    for key, cells in concatenated_pages.items():
        # Compare by value: identity checks against a string literal depend on
        # interpreter interning and raise a SyntaxWarning on modern Python.
        concatenated_pages[key] = [cell for cell in cells if cell != ""]
find_blanks_and_remove_whitespace(concatenated_pages)

# First output row: the column names in the dict's insertion order.
final_representation = [list(concatenated_pages)]

# ``index`` walks the per-sequence cells once across all row names; each row
# name consumes cells until a second "rank one"/"NA" marker is encountered.
index = 0
for rowindex, rowname in enumerate(concatenated_pages["rowname"]):
    onecount = 0
    while index < len(concatenated_pages["Sequence"]):
        print(index, rowindex)
        rankpos = concatenated_pages["Rank position"][index]
        print(rankpos)
        # "NA", or a rank of at most two characters containing the digit 1.
        counts_as_one = rankpos == "NA" or ("1" in rankpos and len(rankpos) <= 2)
        if counts_as_one:
            onecount += 1
        if onecount > 1:
            # The second marker belongs to the next row name; leave it unconsumed.
            break
        # All columns except the last two are indexed per sequence; the row
        # name and its "Design" cell are indexed per row name.
        leading_cells = [
            column[index] for column in list(concatenated_pages.values())[0:-2]
        ]
        item = leading_cells + [rowname, concatenated_pages["Design"][rowindex]]
        final_representation.append(item)
        index += 1
import csv

# Write the table as tab-separated values; fields are quoted with a single
# quote only when they contain the delimiter, the quote char or a newline.
with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t",
                        quotechar="'", quoting=csv.QUOTE_MINIMAL)
    writer.writerows(final_representation)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment