python - Vendor and products details extract from Gmail - Stack Overflow-软件玩家

admin管理员组
文章数量:1242774

I tried to extract vendor and product details from an email into an Excel sheet, including fields like **Mail_Date, Mail_Subject, Product_Name, Product_Quantity, Product_Price, Vendor_Name, Vendor_Email, Vendor_Phone, Vendor_Address, Vendor_GST_No, and Vendor_Website**.

However, I'm facing issues with the regular expressions. When the data has a consistent structure, I get partial output, but when the data is unstructured, the results are completely off and contain random values such as "aaa", "676776", or "0000".

How can I resolve this issue?

I have also tried ML methods with spaCy, but they yield incorrect output. Any suggestions?

import re
import yaml
import imaplib
import spacy
import pandas as pd
from email import message_from_bytes
from bs4 import BeautifulSoup
from dateutil import parser

# Load spaCy's "en_core_web_sm" pipeline once at import time and keep it in a
# module-level name; ProcurementEmailParser.extract_products calls `nlp(text)`
# as a fallback when its regex patterns find no products.
nlp = spacy.load("en_core_web_sm")

class ProcurementEmailParser:
    """Extract vendor and product details from Gmail messages into Excel.

    Typical use is ``ProcurementEmailParser(path).process_emails()``: it logs
    in over IMAP, parses the most recent messages of the inbox and writes one
    row per detected product (columns listed in ``self.output_columns``).

    All extraction is heuristic (regular expressions, with a spaCy fallback
    for products), so results on free-form emails are best-effort.
    """

    # Email address. The TLD class is plain letters: no stray space after the
    # dot and no literal "|" inside the character class.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Indian mobile number with optional +91/91 prefix. The digit lookarounds
    # stop it from matching a 10-digit slice of a longer number (order ids...).
    _PHONE_RE = re.compile(r'(?<!\d)(?:\+?91[\s-]?)?[6-9]\d{9}(?!\d)')

    # Website, tried in two tiers: an explicit URL first, then a bare domain.
    _URL_RE = re.compile(r'(?:https?://|www\.)[\w.-]+\.[a-zA-Z]{2,}')
    # The lookarounds reject anything that is part of an email address
    # (local part followed by "@", or a domain preceded by "@").
    _BARE_DOMAIN_RE = re.compile(
        r'(?<![@\w.-])[\w.-]+\.[a-zA-Z]{2,}(?![\w.%+-]*@)'
    )

    # "<sl no> <name> <qty> <number> <number>" table rows.
    _TABLE_ROW_RE = re.compile(r'(\d+)\s+(.+?)\s+(\d+)\s+([\d,]+)\s+([\d,]+)')

    # "<name> - <qty> nos|units|qty [- <price>]" line items. ``\b`` keeps the
    # unit keyword from matching a prefix of a longer word, and the price may
    # carry a currency symbol and a decimal part.
    _LINE_ITEM_RE = re.compile(
        r'(.+?)\s+[-–]\s+(\d+)\s*(?:nos|units|qty)\b\s*[-–]?\s*'
        r'([₹$]?\s*\d[\d,]*(?:\.\d+)?)?',
        re.IGNORECASE,
    )

    def __init__(self, credentials_path):
        """Load the credentials file; no network connection is made yet.

        Args:
            credentials_path: Path to a YAML file. ``connect_email`` reads its
                ``email`` and ``password`` keys.
        """
        self.credentials = self.load_credentials(credentials_path)
        self.mail = None
        # Vendor_Address and Vendor_GST_No are not extracted by this class,
        # so they are not part of the output columns.
        self.output_columns = [
            'Mail_Date', 'Mail_Subject', 'Product_Name',
            'Product_Quantity', 'Product_Price',
            'Vendor_Name', 'Vendor_Email', 'Vendor_Phone', 'Vendor_Website',
        ]

    def load_credentials(self, path):
        """Return the parsed content of the YAML file at ``path``."""
        with open(path, encoding='utf-8') as f:
            return yaml.safe_load(f)

    def connect_email(self):
        """Open an IMAP-over-SSL session to Gmail and select the inbox.

        Stores the connection in ``self.mail``.
        """
        # Gmail's IMAP endpoint is "imap.gmail.com"; "imap.gmail" does not
        # resolve.
        self.mail = imaplib.IMAP4_SSL('imap.gmail.com')
        self.mail.login(self.credentials['email'],
                        self.credentials['password'])
        self.mail.select('inbox')

    def _disconnect_email(self):
        """Close the selected mailbox and log out, reporting (not raising) errors."""
        if self.mail is None:
            return
        for step in (self.mail.close, self.mail.logout):
            try:
                step()
            except (imaplib.IMAP4.error, OSError) as e:
                print(f"Error closing IMAP connection: {str(e)}")
        self.mail = None

    def extract_vendor_info(self, text):
        """Pull vendor contact details out of the message text.

        Args:
            text: Message body. Line breaks should be preserved: the vendor
                name is taken from the first line after the last "Regards,".

        Returns:
            Dict with keys ``Vendor_Name``, ``Vendor_Email``, ``Vendor_Phone``
            and ``Vendor_Website``; a value is ``''`` when nothing was found.
        """
        vendor_info = {
            'Vendor_Name': '',
            'Vendor_Email': '',
            'Vendor_Phone': '',
            'Vendor_Website': ''
        }
        if not text:
            return vendor_info

        # Extract email
        email_match = self._EMAIL_RE.search(text)
        if email_match:
            vendor_info['Vendor_Email'] = email_match.group(0)

        # Extract phone (spaces between prefix and number are removed)
        phone_match = self._PHONE_RE.search(text)
        if phone_match:
            vendor_info['Vendor_Phone'] = phone_match.group(0).replace(' ', '')

        # Extract website: an explicit URL wins over a bare "name.tld" token,
        # because the latter also matches things like "file.pdf".
        website_match = (self._URL_RE.search(text)
                         or self._BARE_DOMAIN_RE.search(text))
        if website_match:
            vendor_info['Vendor_Website'] = website_match.group(0)

        # Extract vendor name from signature
        if 'Regards,' in text:
            signature_block = text.split('Regards,')[-1]
            lines = [line.strip() for line in signature_block.splitlines()
                     if line.strip()]
            if lines:
                vendor_info['Vendor_Name'] = re.sub(
                    r'[^a-zA-Z\s]', '', lines[0]).strip()

        return vendor_info

    def safe_float_conversion(self, value):
        """Convert a price string such as ``"₹1,500"`` to ``float``.

        Thousands separators, whitespace and the currency symbols ₹ $ € £ are
        removed before conversion.

        Returns:
            The number as ``float``, or ``None`` when ``value`` is empty, not
            a string, or not numeric.
        """
        if not value:
            return None
        try:
            return float(re.sub(r'[₹$€£,\s]', '', value))
        except (TypeError, ValueError):
            return None

    def extract_products(self, text):
        """Find product line items in the message text.

        Two regex patterns are applied (table rows, then "name - N nos"
        lines); only if neither yields anything is the module-level spaCy
        pipeline ``nlp`` consulted.

        Args:
            text: Message body. Product names are captured with ``.+?``, which
                stops at line breaks, so text with its line structure intact
                gives much cleaner names than a single flattened line.

        Returns:
            List of dicts with keys ``Product_Name``, ``Product_Quantity``
            (int) and ``Product_Price`` (float or ``None``).
        """
        products = []
        if not text:
            return products

        # Pattern 1: Table format with price.
        # NOTE(review): the last numeric column (group 5) is used as the
        # price and group 4 is ignored - confirm which column is wanted.
        for match in self._TABLE_ROW_RE.finditer(text):
            products.append({
                'Product_Name': match.group(2).strip(),
                'Product_Quantity': int(match.group(3)),
                'Product_Price': self.safe_float_conversion(match.group(5)),
            })

        # Pattern 2: Line items with optional price
        for match in self._LINE_ITEM_RE.finditer(text):
            products.append({
                'Product_Name': match.group(1).strip(),
                'Product_Quantity': int(match.group(2)),
                'Product_Price': self.safe_float_conversion(match.group(3)),
            })

        # Pattern 3: NLP-based extraction
        if not products:
            doc = nlp(text)
            current_product = {'name': '', 'qty': None, 'price': None}
            for ent in doc.ents:
                if ent.label_ == 'PRODUCT':
                    current_product['name'] = ent.text
                elif ent.label_ == 'QUANTITY' and 'nos' in ent.text.lower():
                    # The entity may contain no digits at all; skip it then
                    # instead of failing on a missing match.
                    digits = re.search(r'\d+', ent.text)
                    if digits:
                        current_product['qty'] = int(digits.group())
                elif ent.label_ == 'MONEY':
                    current_product['price'] = self.safe_float_conversion(ent.text)

                # A record is emitted as soon as name and quantity are known,
                # so a price entity that appears later is not attached to it.
                if current_product['name'] and current_product['qty']:
                    products.append({
                        'Product_Name': current_product['name'],
                        'Product_Quantity': current_product['qty'],
                        'Product_Price': current_product['price']
                    })
                    current_product = {'name': '', 'qty': None, 'price': None}

        return products

    def _extract_body_text(self, email_msg):
        """Return the message body as text, one trimmed non-empty line per row.

        text/plain parts are preferred; text/html parts are used only when no
        plain text is present, so a multipart/alternative message is not read
        (and its products not reported) twice.
        """
        plain_chunks = []
        html_chunks = []
        for part in email_msg.walk():
            content_type = part.get_content_type()
            if content_type not in ('text/plain', 'text/html'):
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            # Honour the declared charset; fall back to UTF-8 when it is
            # missing or unknown to Python.
            charset = part.get_content_charset() or 'utf-8'
            try:
                decoded = payload.decode(charset, 'ignore')
            except LookupError:
                decoded = payload.decode('utf-8', 'ignore')
            if content_type == 'text/plain':
                plain_chunks.append(decoded)
            else:
                html_chunks.append(decoded)

        raw_text = '\n'.join(plain_chunks)
        if not raw_text.strip():
            raw_text = '\n'.join(
                BeautifulSoup(markup, 'html.parser').get_text(
                    separator='\n', strip=True)
                for markup in html_chunks
            )

        # Collapse whitespace inside each line but keep the line breaks: the
        # signature and product heuristics depend on them.
        cleaned_lines = (' '.join(line.split()) for line in raw_text.splitlines())
        return '\n'.join(line for line in cleaned_lines if line)

    def _format_mail_date(self, raw_date):
        """Format a Date header as ``YYYY-MM-DD HH:MM:SS``; ``''`` if unusable."""
        # Imported locally under an alias so that a module-level variable
        # named ``parser`` cannot shadow the dateutil module.
        from dateutil import parser as date_parser

        if not raw_date:
            return ''
        try:
            return date_parser.parse(str(raw_date)).strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, OverflowError):
            return ''

    def process_email(self, email_msg):
        """Turn one parsed email into output records.

        Args:
            email_msg: An ``email.message.Message``.

        Returns:
            One dict per detected product, combining mail date, subject,
            product fields and vendor fields. An email without detected
            products yields ``[]``; so does any processing error, which is
            printed rather than raised.
        """
        try:
            text_content = self._extract_body_text(email_msg)

            # Extract vendor info
            vendor_info = self.extract_vendor_info(text_content)

            # Extract products
            products = self.extract_products(text_content)

            # Header values are the same for every record of this email.
            mail_date = self._format_mail_date(email_msg['Date'])
            mail_subject = email_msg.get('Subject', 'No Subject')

            # Create records
            return [
                {
                    'Mail_Date': mail_date,
                    'Mail_Subject': mail_subject,
                    **product,
                    **vendor_info,
                }
                for product in products
            ]

        except Exception as e:
            print(f"Error processing email: {str(e)}")
            return []

    def process_emails(self, limit=50, save_path='output.xlsx'):
        """Parse the most recent inbox messages and save the result to Excel.

        Args:
            limit: Number of most recent messages to process. ``None`` means
                all messages; zero or a negative number means none.
            save_path: Destination of the Excel file.

        Returns:
            The ``pandas.DataFrame`` that was written, with the columns of
            ``self.output_columns``.
        """
        self.connect_email()
        all_data = []

        try:
            _, msg_ids = self.mail.search(None, 'ALL')
            ids = msg_ids[0].split() if msg_ids and msg_ids[0] else []
            if limit is not None:
                # ids[-0:] would be the whole list, so handle <= 0 explicitly.
                ids = ids[-limit:] if limit > 0 else []

            for msg_id in ids:
                try:
                    _, msg_data = self.mail.fetch(msg_id, '(RFC822)')
                    email_msg = message_from_bytes(msg_data[0][1])
                    all_data.extend(self.process_email(email_msg))
                except Exception as e:
                    print(f"Error processing email {msg_id.decode()}: {str(e)}")
                    continue
        finally:
            # Always release the IMAP session, even if searching failed.
            self._disconnect_email()

        # Save to Excel
        df = pd.DataFrame(all_data, columns=self.output_columns)
        df.to_excel(save_path, index=False)
        return df

if __name__ == "__main__":
    # Deliberately not named `parser`: this runs at module level, so rebinding
    # `parser` would shadow the `dateutil.parser` module imported at the top of
    # the file, and any later `parser.parse(...)` lookup would hit this object.
    email_parser = ProcurementEmailParser("C:\\Users\\one\\credentials.yml")
    df = email_parser.process_emails(limit=50, save_path="C:\\Users\\one\\vp_details.xlsx")
    print(f"Successfully processed {len(df)} records")

本文标签： pythonVendor and products details extract from GmailStack Overflow

版权声明：本文标题：python - Vendor and products details extract from Gmail - Stack Overflow 内容由网友自发贡献，该文观点仅代表作者本人，转载请联系作者并注明出处：http://www.betaflare.com/web/1740160528a2234343.html，本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容，一经查实，本站将立刻删除。

编程频道|软件玩家 - 软件改变生活！

python - Vendor and products details extract from Gmail - Stack Overflow

更多相关文章