From df23e520fa628eb665f70de1a6438fc692c9551f Mon Sep 17 00:00:00 2001 From: Alex Selimov Date: Fri, 18 Apr 2025 16:27:18 -0400 Subject: [PATCH] Add maildir parser and tests --- src/maildirclean/maildir.py | 57 +++++++++++++++++++--- tests/fixtures.py | 47 ++++++++++++++++-- tests/test_maildir.py | 97 +++++++++++++++++++++++++++++++++++-- 3 files changed, 188 insertions(+), 13 deletions(-) diff --git a/src/maildirclean/maildir.py b/src/maildirclean/maildir.py index 69e568f..b13500b 100644 --- a/src/maildirclean/maildir.py +++ b/src/maildirclean/maildir.py @@ -1,10 +1,11 @@ -import pathlib +from pathlib import Path +import pandas as pd METADATA_SCHEMA = sorted(["Path", "From", "Date"]) -def make_email_metadata(email_path: str) -> dict[str, str]: +def make_email_metadata(email_path: str | Path) -> dict[str, str]: """Make an email metadata object by parsing the email contents Args: @@ -14,7 +15,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]: Returns: Dict containing the required metadata """ key_is_set = {key: False for key in METADATA_SCHEMA} - metadata = {"Path": email_path} + metadata = {"Path": str(email_path)} key_is_set["Path"] = True with open(email_path, "r") as f: @@ -35,7 +36,7 @@ def make_email_metadata(email_path: str) -> dict[str, str]: return metadata -def parse_maildir(path_to_dir: str | pathlib.Path): +def parse_maildir(path_to_dir: str | Path): """Parse all of the emails within the specified maildir box (not recursively) Args: @@ -44,7 +45,17 @@ def parse_maildir(path_to_dir: str | pathlib.Path): Returns: MailDir object initialized with email information """ - return MailDir(_) + file_list = Path(path_to_dir).glob("*") + email_metadata = [make_email_metadata(file) for file in file_list] + return MailDir(email_metadata) + + +class TopSender: + """Simple class to store the top sender alongside the first 5 names they used""" + + def __init__(self, email: str, names: list[str]): + self.email = email + self.names = names class MailDir: @@ -52,5 +63,39 @@ class MailDir: Stores the metadata associated with all local emails. """ + KEYS_AND_FUNCS = { + "Name": lambda df: df["From"].map(lambda x: x.split("<")[0].strip('" ')), + "Email": lambda df: df["From"].map(lambda x: x.split("<")[1].strip(">")), + "Date": lambda df: pd.to_datetime(df["Date"]), + } + def __init__(self, email_metadata: list[dict[str, str]]): - pass + if email_metadata: + self._df = pd.DataFrame(email_metadata) + + for k, func in self.KEYS_AND_FUNCS.items(): + self._df[k] = func(self._df) + + else: + self._df = pd.DataFrame( + columns=METADATA_SCHEMA + list(self.KEYS_AND_FUNCS.keys()) + ) + + def get_top_n_senders(self, n: int) -> list[TopSender]: + """Calculate the top n senders and returns their information as a TopSender object + The TopSender object + Args: + n: Number of senders to retrieve + + Returns: list of TopSender objects + + """ + + senders = [ + TopSender( + email, list(self._df.loc[self._df["Email"] == email, "Name"].unique()) + ) + for email in self._df["Email"].value_counts().iloc[0:n].index + ] + + return senders diff --git a/tests/fixtures.py b/tests/fixtures.py index 4814d1d..1ac2b79 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -7,7 +7,7 @@ from tempfile import TemporaryDirectory def tmp_dir(): tmp_dir = TemporaryDirectory(delete=False) print("Created temporary directory ", tmp_dir.name) - yield tmp_dir.name + yield Path(tmp_dir.name) print("Cleaned temporary directory", tmp_dir.name) tmp_dir.cleanup() @@ -46,7 +46,7 @@ def test_email(tmp_dir): X-Google-Smtp-Source: AGHT+IGQyWQO69p9mhCHt5N5NbKLfb9Ij9fgRFGjk+UJNpRo3S9VPDV6pXXucyU0xAL3AiT5jNtO16w= X-Received: by 2002:a25:fb02:0:b0:664:f31a:2be0 with SMTP id u2-20020a25fb02000000b00664f31a2be0mr13538287ybg.36.1713373420812; Wed, 16 Apr 2025 09:23:40 -0700 (PDT) - From: "John Doe" sender@example.com + From: "John Doe" To: recipient@example.org Subject: Sample email for parsing exercises Date: Wed, 16 Apr 2025 12:23:35 -0400 @@ -92,7 +92,46 @@ def test_email(tmp_dir): Phone: (555) 123-4567 """ - email_path = Path(tmp_dir) / "test_email.txt" + email_path = tmp_dir / "test_email.txt" with open(email_path, "w") as f: f.write(test_contents) - yield email_path + return email_path + + +@pytest.fixture +def sample_email_dir(tmp_dir): + # Include only the necessary meta data since we have validated single email parsing + sample_emails = [ + """ + From: Test + Date: Wed, 10 Apr 2025 12:23:35 -0400 + """, + """ + From: Not a Test + Date: Wed, 16 Apr 2024 08:23:35 -0400 + """, + """ + From: "Test2" + Date: Wed, 11 Apr 2025 12:23:35 -0400 + """, + ] + + for i, email in enumerate(sample_emails): + with open(tmp_dir / f"{i}", "w") as f: + f.write(email) + return tmp_dir + + +@pytest.fixture +def sample_email_metadata(): + return [ + {"From": "John Doe ", "Date": "2025-01-01"}, + {"From": "John Doe ", "Date": "2025-01-02"}, + {"From": "Johnny Doe ", "Date": "2025-01-03"}, + {"From": "J. Doe ", "Date": "2025-01-04"}, + {"From": "Jane Smith ", "Date": "2025-01-05"}, + {"From": "Jane S. ", "Date": "2025-01-06"}, + {"From": "Alex Johnson ", "Date": "2025-01-07"}, + {"From": "Alex J. ", "Date": "2025-01-08"}, + {"From": "Sarah Williams ", "Date": "2025-01-09"}, + ] diff --git a/tests/test_maildir.py b/tests/test_maildir.py index b508107..4351420 100644 --- a/tests/test_maildir.py +++ b/tests/test_maildir.py @@ -1,11 +1,102 @@ +import pandas as pd + from fixtures import * -from maildirclean.maildir import make_email_metadata +from maildirclean.maildir import make_email_metadata, MailDir, TopSender, parse_maildir def test_email_parsing(test_email): metadata = make_email_metadata(test_email) - assert metadata["From"] == '"John Doe" sender@example.com' + assert metadata["From"] == '"John Doe" ' assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400" - assert metadata["Path"] == test_email + assert metadata["Path"] == str(test_email) + + +def test_maildir_creation(test_email): + maildir = MailDir([make_email_metadata(test_email)]) + + metadata = maildir._df.iloc[0] + assert metadata["From"] == '"John Doe" ' + assert metadata["Name"] == "John Doe" + assert metadata["Email"] == "sender@example.com" + assert metadata["Date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400") + assert metadata["Path"] == str(test_email) + + +def test_get_top_n_senders(sample_email_metadata): + # Initialize MailDir with sample data + maildir = MailDir(sample_email_metadata) + + # Test getting top 2 senders + top_senders = maildir.get_top_n_senders(2) + + # Assertions + assert len(top_senders) == 2 + + # john.doe@example.com should be the top sender (4 emails) + assert top_senders[0].email == "john.doe@example.com" + assert set(top_senders[0].names) == {"John Doe", "Johnny Doe", "J. Doe"} + + # jane.smith@example.com should be the second (2 emails) + assert top_senders[1].email == "jane.smith@example.com" + assert set(top_senders[1].names) == {"Jane Smith", "Jane S."} + + +def test_get_top_n_senders_with_empty_data(): + # Initialize MailDir with empty data + maildir = MailDir([]) + + # Test getting top senders from empty data + top_senders = maildir.get_top_n_senders(5) + + # Should return empty list + assert len(top_senders) == 0 + + +def test_get_top_n_senders_with_n_greater_than_unique_senders(sample_email_metadata): + # Initialize MailDir with sample data + maildir = MailDir(sample_email_metadata) + + # Test getting more senders than exist + top_senders = maildir.get_top_n_senders(10) + + # Should only return 4 senders (as there are only 4 unique emails) + assert len(top_senders) == 4 + + # Verify all expected emails are present + emails = [sender.email for sender in top_senders] + assert set(emails) == { + "john.doe@example.com", + "jane.smith@example.com", + "alex.johnson@example.com", + "sarah@example.com", + } + + +def test_get_top_n_senders_ordering(sample_email_metadata): + # Initialize MailDir with sample data + maildir = MailDir(sample_email_metadata) + + # Test getting all senders + top_senders = maildir.get_top_n_senders(4) + + # Verify ordering by count + assert [sender.email for sender in top_senders] == [ + "john.doe@example.com", # 4 emails + "jane.smith@example.com", # 2 emails + "alex.johnson@example.com", # 2 emails + "sarah@example.com", # 1 email + ] + + +def test_parse_maildir(sample_email_dir): + maildir = parse_maildir(sample_email_dir) + + assert len(maildir._df) == 3 + assert "test@something.org" in list(maildir._df["Email"]) + assert "not_a_test@something.org" in list(maildir._df["Email"]) + + assert "Test" in list(maildir._df["Name"]) + assert "Not a Test" in list(maildir._df["Name"]) + assert "Test2" in list(maildir._df["Name"])