Add file deletion capability and tests

Rework cli to accept ranges
Update to include email count in top senders list
2025-04-21 22:04:08 -04:00 · 2025-04-21 22:03:32 -04:00 · 2025-04-19 10:34:28 -04:00 · 2025-04-19 10:25:13 -04:00
10 changed files with 361 additions and 45 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -3,13 +3,9 @@ name = "maildirclean"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-authors = [
-    { name = "Alex Selimov", email = "alex@alexselimov.com" }
-]
+authors = [{ name = "Alex Selimov", email = "alex@alexselimov.com" }]
 requires-python = ">=3.13"
-dependencies = [
-    "pandas>=2.2.3",
-]
+dependencies = ["pandas>=2.2.3"]

 [project.scripts]
 maildirclean = "maildirclean:main"
@ -19,6 +15,4 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"

 [dependency-groups]
-dev = [
-    "pytest>=8.3.5",
-]
+dev = ["pytest>=8.3.5"]
--- a/src/maildirclean/init.py
+++ b/src/maildirclean/init.py
@ -1,2 +1,5 @@
-def main() -> None:
-    print("Hello from maildirclean!")
+from .cli import cli
+
+
+def main() -> int:
+    return cli()
--- a/src/maildirclean/cli.py
+++ b/src/maildirclean/cli.py
@ -0,0 +1,160 @@
+import argparse
+import sys
+from pathlib import Path
+
+from .maildir import MailDir, TopSender, parse_maildir
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse command line arguments
+
+    Returns: Namespace object corresponding to parsed arguments
+
+    """
+    parser = argparse.ArgumentParser(
+        description="Analyze email metadata from a maildir directory"
+    )
+    parser.add_argument(
+        "maildir", type=str, help="Path to the maildir directory to analyze"
+    )
+    parser.add_argument(
+        "--top",
+        "-t",
+        type=int,
+        default=5,
+        help="Number of top senders to display (default: 5)",
+    )
+    parser.add_argument(
+        "--verbose", "-v", action="store_true", help="Enable verbose output"
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def cli():
+    args = parse_arguments()
+
+    maildir_path = Path(args.maildir)
+    if not maildir_path.exists() or not maildir_path.is_dir():
+        print(f"Error: {args.maildir} is not a valid directory", file=sys.stderr)
+        return 1
+
+    if args.verbose:
+        print(f"Analyzing emails in {maildir_path}...")
+
+    run_loop(args, maildir_path)
+
+    return 0
+
+
+def run_loop(args: argparse.Namespace, maildir_path: str | Path):
+
+    # Set up maildir
+    maildir = parse_maildir(maildir_path)
+    if args.verbose:
+        print(f"Found {len(maildir._df)} emails")
+
+    # Main running loop
+    while True:
+        top_senders = maildir.get_top_n_senders(args.top)
+
+        if not top_senders:
+            print("No senders found in the maildir", file=sys.stderr)
+            return 0
+
+        result = []
+        for i, sender in enumerate(top_senders, 1):
+            names_str = ", ".join(sender.names[:5])  # Limit to first 5 names
+            if len(sender.names) > 5:
+                names_str += f" and {len(sender.names) - 5} more"
+
+            result.append(
+                f"{i}. {sender.email} - Email count: {sender.count} - Names used: {names_str}"
+            )
+
+        output = "\n".join(
+            [f"Top {len(top_senders)} senders in {maildir_path}:", "=" * 40, *result]
+        )
+
+        print(output)
+        if not user_input_loop(top_senders, maildir):
+            break
+
+
+def user_input_loop(top_senders: list[TopSender], maildir: MailDir) -> bool:
+    user_input = input("> ").strip()
+    while handle_user_input(user_input, top_senders, maildir):
+        user_input = input("> ").strip()
+
+
+def parse_selections(user_input, max_selection):
+    """
+    Parse user input into a set of valid selections.
+
+    Args:
+        user_input (str): User input string with numbers, comma-separated lists, or ranges
+        max_selection (int): Maximum allowed selection number
+
+    Returns:
+        set: Set of valid selection numbers
+
+    Raises:
+        ValueError: If any input is invalid or out of range
+    """
+    selections = set()
+
+    # Clean up the input
+    user_input = user_input.strip()
+
+    # Split by comma
+    items = [item.strip() for item in user_input.split(",")]
+
+    for item in items:
+        if "-" in item:
+            # Handle range (e.g., "1-3" or "4 - 5")
+            range_parts = [part.strip() for part in item.split("-")]
+            if (
+                len(range_parts) != 2
+                or not range_parts[0].isdigit()
+                or not range_parts[1].isdigit()
+            ):
+                raise ValueError(f"Invalid range format: {item}")
+
+            start = int(range_parts[0])
+            end = int(range_parts[1])
+
+            if start > end:
+                raise ValueError(
+                    f"Invalid range: {item}. Start must be less than or equal to end."
+                )
+
+            selections.update(range(start, end + 1))
+        elif item.isdigit():
+            # Handle single number
+            selections.add(int(item))
+        else:
+            raise ValueError(f"Invalid input: {item}")
+
+    # Check if any selection is out of range
+    out_of_range = [s for s in selections if s < 1 or s > max_selection]
+    if out_of_range:
+        raise ValueError(
+            f"Selection(s) out of range: {', '.join(map(str, out_of_range))}. Valid range is 1-{max_selection}"
+        )
+
+    return selections
+
+
+def handle_user_input(user_input, top_senders, maildir):
+    if user_input.lower() == "q":
+        return False
+
+    try:
+        selections = parse_selections(user_input, len(top_senders))
+        for selection in selections:
+            selected_sender = top_senders[selection - 1]
+            print(f"Selected {selected_sender.email}")
+        return True
+    except ValueError:
+        print("Please enter a valid number or 'q' to quit")
--- a/src/maildirclean/filedir.py
+++ b/src/maildirclean/filedir.py
@ -0,0 +1,14 @@
+"""Module containing functionality to interact with the filesystem"""
+
+import os
+from pathlib import Path
+
+
+def delete_files(file_list: list[str | Path]):
+    """Delete all files in the provided file list
+
+    Args:
+        file_list: List of file paths as either strings or Path
+    """
+    for file in file_list:
+        os.remove(file)
--- a/src/maildirclean/maildir.py
+++ b/src/maildirclean/maildir.py
@ -1,8 +1,11 @@
 from pathlib import Path
 import pandas as pd
+import re
+
+from .utility import first_match_or_empty, to_datetime_safe


-METADATA_SCHEMA = sorted(["Path", "From", "Date"])
+METADATA_SCHEMA = sorted(["path", "from", "date"])


 def make_email_metadata(email_path: str | Path) -> dict[str, str]:
@ -15,13 +18,17 @@ def make_email_metadata(email_path: str | Path) -> dict[str, str]:
    Returns: Dict containing the required metadata
    """
    key_is_set = {key: False for key in METADATA_SCHEMA}
-    metadata = {"Path": str(email_path)}
-    key_is_set["Path"] = True
+    metadata = {"path": str(email_path)}
+    key_is_set["path"] = True

-    with open(email_path, "r") as f:
+    with open(email_path, "rb") as f:
        for line in f:
            try:
-                k, v = [val.strip() for val in line.split(":", maxsplit=1)]
+                k, v = [
+                    val.strip()
+                    for val in line.decode(errors="ignore").split(":", maxsplit=1)
+                ]
+                k = k.lower()
                if k in METADATA_SCHEMA:
                    metadata[k] = v
                    key_is_set[k] = True
@ -51,11 +58,14 @@ def parse_maildir(path_to_dir: str | Path):


 class TopSender:
-    """Simple class to store the top sender alongside the first 5 names they used"""
+    """Simple class to store the top sender,
+    alongside names they used and the count of emails sent by them
+    """

-    def __init__(self, email: str, names: list[str]):
+    def __init__(self, email: str, names: list[str], count: int):
        self.email = email
        self.names = names
+        self.count = count


 class MailDir:
@ -63,10 +73,17 @@ class MailDir:
    Stores the metadata associated with all local emails.
    """

+    name_regex = r"^(.*?)(?=<)"
+    email_regex = r"<?([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)>?"
+
    KEYS_AND_FUNCS = {
-        "Name": lambda df: df["From"].map(lambda x: x.split("<")[0].strip('" ')),
-        "Email": lambda df: df["From"].map(lambda x: x.split("<")[1].strip(">")),
-        "Date": lambda df: pd.to_datetime(df["Date"]),
+        "name": lambda df: df["from"].map(
+            lambda x: first_match_or_empty(MailDir.name_regex, x).strip('" ')
+        ),
+        "email": lambda df: df["from"].map(
+            lambda x: first_match_or_empty(MailDir.email_regex, x).strip("")
+        ),
+        "date": lambda df: df["date"].map(lambda x: to_datetime_safe(x)),
    }

    def __init__(self, email_metadata: list[dict[str, str]]):
@ -91,11 +108,14 @@ class MailDir:

        """

+        unique_senders = self._df["email"].value_counts().iloc[0:n]
        senders = [
            TopSender(
-                email, list(self._df.loc[self._df["Email"] == email, "Name"].unique())
+                email,
+                list(self._df.loc[self._df["email"] == email, "name"].unique()),
+                count,
            )
-            for email in self._df["Email"].value_counts().iloc[0:n].index
+            for (email, count) in unique_senders.items()
        ]

        return senders
--- a/src/maildirclean/utility.py
+++ b/src/maildirclean/utility.py
@ -0,0 +1,28 @@
+import re
+import pandas as pd
+from datetime import datetime
+
+
+def first_match_or_empty(pattern: str, text: str) -> str:
+    """Get the first match for the provided pattern or "" if empty.
+    This is a wrapper to facilitate usage of the re.search in lambda expressions
+
+    Args:
+        pattern: Pattern to search for
+        text: Text that will be searched
+
+    Returns: First pattern match or ""
+
+    """
+    match = re.search(pattern, text)
+    if match:
+        return match.group(1)
+    else:
+        return ""
+
+
+def to_datetime_safe(datetime_str: str):
+    try:
+        return pd.to_datetime(datetime_str, format="mixed", errors="coerce")
+    except (ValueError, AttributeError):
+        return datetime.now()
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@ -125,13 +125,13 @@ def sample_email_dir(tmp_dir):
@pytest.fixture
 def sample_email_metadata():
    return [
-        {"From": "John Doe <john.doe@example.com>", "Date": "2025-01-01"},
-        {"From": "John Doe <john.doe@example.com>", "Date": "2025-01-02"},
-        {"From": "Johnny Doe <john.doe@example.com>", "Date": "2025-01-03"},
-        {"From": "J. Doe <john.doe@example.com>", "Date": "2025-01-04"},
-        {"From": "Jane Smith <jane.smith@example.com>", "Date": "2025-01-05"},
-        {"From": "Jane S. <jane.smith@example.com>", "Date": "2025-01-06"},
-        {"From": "Alex Johnson <alex.johnson@example.com>", "Date": "2025-01-07"},
-        {"From": "Alex J. <alex.johnson@example.com>", "Date": "2025-01-08"},
-        {"From": "Sarah Williams <sarah@example.com>", "Date": "2025-01-09"},
+        {"from": "John Doe <john.doe@example.com>", "date": "2025-01-01"},
+        {"from": "John Doe <john.doe@example.com>", "date": "2025-01-02"},
+        {"from": "Johnny Doe <john.doe@example.com>", "date": "2025-01-03"},
+        {"from": "J. Doe <john.doe@example.com>", "date": "2025-01-04"},
+        {"from": "Jane Smith <jane.smith@example.com>", "date": "2025-01-05"},
+        {"from": "Jane S. <jane.smith@example.com>", "date": "2025-01-06"},
+        {"from": "Alex Johnson <alex.johnson@example.com>", "date": "2025-01-07"},
+        {"from": "Alex J. <alex.johnson@example.com>", "date": "2025-01-08"},
+        {"from": "Sarah Williams <sarah@example.com>", "date": "2025-01-09"},
    ]
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -0,0 +1,87 @@
+import pytest
+from pathlib import Path
+
+# Import the function to test - assuming it's in a module called 'maildir_analyzer'
+# Update this import to match your actual module structure
+from maildirclean.cli import parse_selections
+
+
+def test_single_number():
+    result = parse_selections("3", 5)
+    assert result == {3}
+
+
+def test_comma_separated_list():
+    result = parse_selections("1,3,5", 5)
+    assert result == {1, 3, 5}
+
+
+def test_range():
+    result = parse_selections("2-4", 5)
+    assert result == {2, 3, 4}
+
+
+def test_range_with_spaces():
+    result = parse_selections("1 - 3", 5)
+    assert result == {1, 2, 3}
+
+
+def test_combined_input():
+    result = parse_selections("1,3-5,7", 10)
+    assert result == {1, 3, 4, 5, 7}
+
+
+def test_duplicate_numbers():
+    result = parse_selections("1,1,2,2-4,3", 5)
+    assert result == {1, 2, 3, 4}  # Set removes duplicates
+
+
+def test_invalid_format():
+    with pytest.raises(ValueError, match="Invalid input: abc"):
+        parse_selections("abc", 5)
+
+
+def test_invalid_range_format():
+    with pytest.raises(ValueError, match="Invalid range format: 2-a"):
+        parse_selections("2-a", 5)
+
+
+def test_inverted_range():
+    with pytest.raises(ValueError, match="Invalid range: 5-2"):
+        parse_selections("5-2", 5)
+
+
+def test_out_of_range():
+    with pytest.raises(ValueError, match="Selection.*out of range"):
+        parse_selections("3,6,8", 5)
+
+
+def test_mix_valid_and_invalid():
+    with pytest.raises(ValueError):
+        parse_selections("1,abc,3", 5)
+
+
+def test_empty_input():
+    with pytest.raises(ValueError):
+        parse_selections("", 5)
+
+
+def test_whitespace_only():
+    with pytest.raises(ValueError):
+        parse_selections("   ", 5)
+
+
+def test_max_boundary():
+    # Test the boundary case
+    result = parse_selections("5", 5)
+    assert result == {5}
+
+
+def test_complex_input():
+    result = parse_selections("1-2, 4, 6-8", 10)
+    assert result == {1, 2, 4, 6, 7, 8}
+
+
+def test_negative_numbers():
+    with pytest.raises(ValueError, match="Invalid range format:*"):
+        parse_selections("-1,2", 5)
--- a/tests/test_filedir.py
+++ b/tests/test_filedir.py
@ -0,0 +1,10 @@
+from fixtures import *
+from maildirclean.filedir import delete_files
+
+
+def test_delete_files(sample_email_dir):
+    file_list = list(Path(sample_email_dir).glob("*"))
+    delete_files(file_list)
+
+    for file in file_list:
+        assert not Path(file).is_file()
--- a/tests/test_maildir.py
+++ b/tests/test_maildir.py
@ -8,20 +8,20 @@ def test_email_parsing(test_email):

    metadata = make_email_metadata(test_email)

-    assert metadata["From"] == '"John Doe" <sender@example.com>'
-    assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
-    assert metadata["Path"] == str(test_email)
+    assert metadata["from"] == '"John Doe" <sender@example.com>'
+    assert metadata["date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
+    assert metadata["path"] == str(test_email)


 def test_maildir_creation(test_email):
    maildir = MailDir([make_email_metadata(test_email)])

    metadata = maildir._df.iloc[0]
-    assert metadata["From"] == '"John Doe" <sender@example.com>'
-    assert metadata["Name"] == "John Doe"
-    assert metadata["Email"] == "sender@example.com"
-    assert metadata["Date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400")
-    assert metadata["Path"] == str(test_email)
+    assert metadata["from"] == '"John Doe" <sender@example.com>'
+    assert metadata["name"] == "John Doe"
+    assert metadata["email"] == "sender@example.com"
+    assert metadata["date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400")
+    assert metadata["path"] == str(test_email)


 def test_get_top_n_senders(sample_email_metadata):
@ -94,9 +94,9 @@ def test_parse_maildir(sample_email_dir):
    maildir = parse_maildir(sample_email_dir)

    assert len(maildir._df) == 3
-    assert "test@something.org" in list(maildir._df["Email"])
-    assert "not_a_test@something.org" in list(maildir._df["Email"])
+    assert "test@something.org" in list(maildir._df["email"])
+    assert "not_a_test@something.org" in list(maildir._df["email"])

-    assert "Test" in list(maildir._df["Name"])
-    assert "Not a Test" in list(maildir._df["Name"])
-    assert "Test2" in list(maildir._df["Name"])
+    assert "Test" in list(maildir._df["name"])
+    assert "Not a Test" in list(maildir._df["name"])
+    assert "Test2" in list(maildir._df["name"])
Author	SHA1	Message	Date
Alex Selimov	e2698281c4	Add file deletion capability and tests	2025-04-21 22:04:08 -04:00
Alex Selimov	32007d5c36	Rework cli to accept ranges	2025-04-21 22:03:32 -04:00
Alex Selimov	77ec5ffff1	Update to include email count in top senders list	2025-04-19 10:34:28 -04:00
Alex Selimov	88899dfbd4	Add cli, top sender calculation functionality	2025-04-19 10:25:13 -04:00