admin管理员组

文章数量:1404054

My use case involves textual table data, but the column header cells contain multiple lines of text (image shared), which results in bad parsing by PyMuPDF. I have tried Camelot and Tabula as well, but this issue is common to all of them.

Can someone please suggest another method, or a setting that I need to change or tune, in order to get better and more accurate parsing of such tables?

import fitz  # PyMuPDF
import pandas as pd

def extract_table_from_pdf(pdf_path):
    """Collect the contents of every table PyMuPDF detects in a PDF.

    Each page is scanned with ``page.find_tables()`` and the result of
    ``table.extract()`` for every detected table is appended to one flat
    list, in page order. Contents of different tables are concatenated
    with no separator between them.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with the extracted entries of all detected tables; empty
        when no page contains a detected table.
    """
    data = []

    # Open the document as a context manager so it is closed even when
    # table detection or extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            tables = page.find_tables()

            print(f"Page {page_number}: Found tables -> {tables.tables}")  # Debugging

            # A page without detected tables yields an empty list here,
            # so the loop simply does nothing for it.
            for table in tables.tables:
                data.extend(table.extract())

    return data

def main(pdf_path, output_filename):
    """Extract all tables from a PDF and write them to an Excel file.

    Args:
        pdf_path: Path of the PDF file to read tables from.
        output_filename: Path of the Excel workbook to write.
    """
    rows = extract_table_from_pdf(pdf_path)

    # Build the frame from the extracted rows and write it without the
    # DataFrame index column.
    pd.DataFrame(rows).to_excel(output_filename, index=False)

    print(f"Table extracted and saved to {output_filename}")

if __name__ == "__main__":
    # Script entry point: adjust the input PDF path (and output name) here.
    pdf_path, output_filename = "two_pages.pdf", "output_two.xlsx"
    main(pdf_path=pdf_path, output_filename=output_filename)

parsed result:

本文标签: python camelot handling complex tables with PyMuPdf Stack Overflow