admin管理员组

文章数量:1345179

I am trying to do as the topic says. According to some sources, it should be possible to extract comments in CSV format, but I only have FDF and XFDF formats available, and I've tried using both Acrobat Reader and PDFXchange Editor.

After creating an FDF file (I tried an XFDF file as well, but that file seems to only contain some decrypted interpretation of the text I'm after so my scripts can't find anything), I have tried to write both a Python script and a VBA macro to extract the data I need from the FDF file, but for some reason neither script manages to correctly paste the data into the Excel sheet even though they both manage to find it. I used print() and Msg Box to verify.

I have also tried importing the comments from the PDF file from Excel, using power queries, but it doesn't find comments specifically and any data it finds isn't structured anywhere close to presentable.

Below are first my VBA macro, and secondly my Python script:

VBA:

Sub ImportCommentsFromFDF()
    Dim fdfFilePath As String
    Dim excelFilePath As String
    Dim fdfFileNum As Integer
    Dim line As String
    Dim commentator As String
    Dim comment As String
    Dim commentDate As String
    Dim rowNum As Long
    Dim ws As Worksheet
    Dim wb As Workbook
    Dim commenters As Collection
    Dim comments As Collection
    Dim commentDates As Collection

    fdfFilePath = ThisWorkbook.Path & "\comments.fdf"
    excelFilePath = ThisWorkbook.Path & "\Comments.xlsx"
    
    If Dir(fdfFilePath) = "" Then
        MsgBox "Error: The file '" & fdfFilePath & "' does not exist."
        Exit Sub
    End If

    If Dir(excelFilePath) = "" Then
        MsgBox "Error: The file '" & excelFilePath & "' does not exist. A new file will be created."
        Exit Sub
    End If

    Set wb = Workbooks.Open(excelFilePath)
    Set ws = wb.Sheets("Sheet1")

    Set commenters = New Collection
    Set comments = New Collection
    Set commentDates = New Collection

    fdfFileNum = FreeFile
    Open fdfFilePath For Input As fdfFileNum
    
    Do While Not EOF(fdfFileNum)
        Line Input #fdfFileNum, line
        
        commentator = ""
        comment = ""
        commentDate = ""

        If InStr(line, "/T (") > 0 Then
            commentator = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Commentator: " & commentator
        End If
        
        If InStr(line, "/Contents (") > 0 Then
            comment = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Comment: " & comment
        End If
        

        If InStr(line, "/CreationDate (") > 0 Then
            commentDate = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Comment Date: " & commentDate
        End If
        
        If commentator <> "" And comment <> "" And commentDate <> "" Then
            commenters.Add commentator
            comments.Add comment
            commentDates.Add commentDate
        End If
    Loop
    
    Close fdfFileNum
    
    If commenters.Count = comments.Count And comments.Count = commentDates.Count Then
        rowNum = 2
        For i = 1 To commenters.Count
            ws.Cells(rowNum, 1).Value = commenters(i)
            ws.Cells(rowNum, 2).Value = comments(i)
            ws.Cells(rowNum, 3).Value = commentDates(i)
            rowNum = rowNum + 1
        Next i
        
        wb.Save
        MsgBox "Comments imported successfully!"
    Else
        MsgBox "Warning: The lists are not of the same length." & vbCrLf & _
               "Commenters: " & commenters.Count & vbCrLf & _
               "Comments: " & comments.Count & vbCrLf & _
               "Dates: " & commentDates.Count
    End If

    wb.Close SaveChanges:=True
End Sub

Python:

import pandas as pd
import os

def import_comments_from_fdf(fdf_file_path, excel_file_path):
    if not os.path.isfile(fdf_file_path):
        print(f"Error: The file '{fdf_file_path}' does not exist.")
        return

    if not os.path.isfile(excel_file_path):
        print(f"Error: The file '{excel_file_path}' does not exist. A new file will be created.")

    commenters = []
    comments = []
    comment_dates = []

    with open(fdf_file_path, 'r', encoding='ISO-8859-1') as fdf_file:
        for line in fdf_file:
            commentator = ""
            comment = ""
            comment_date = ""

            if '/T (' in line:
                commentator = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Commentator: {commentator}")

            if '/Contents (' in line:
                comment = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Comment: {comment}")

            if '/CreationDate (' in line:
                comment_date = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Comment Date: {comment_date}")

            if commentator and comment and comment_date:
                commenters.append(commentator)
                comments.append(comment)
                comment_dates.append(comment_date)

    if len(commenters) == len(comments) == len(comment_dates):
        data = {
            'Commenter': commenters,
            'Comment': comments,
            'Date Created': comment_dates
        }
        df = pd.DataFrame(data)

        print("DataFrame to be written:")
        print(df)

        with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
            for index, row in df.iterrows():
                writer.sheets['Sheet1'].cell(row=index + 2, column=1, value=row['Commenter'])
                writer.sheets['Sheet1'].cell(row=index + 2, column=2, value=row['Comment'])
                writer.sheets['Sheet1'].cell(row=index + 2, column=3, value=row['Date Created'])

        print("Comments imported successfully!")
    else:
        print("Warning: The lists are not of the same length.")
        print(f"Commenters: {len(commenters)}, Comments: {len(comments)}, Dates: {len(comment_dates)}")

fdf_file_path = 'comments.fdf'
excel_file_path = 'Comments.xlsx'

import_comments_from_fdf(fdf_file_path, excel_file_path)

本文标签: How to extract comments from a PDF and put them into an Excel sheetStack Overflow