Compare commits

...

4 Commits

10 changed files with 361 additions and 45 deletions

View File

@ -3,13 +3,9 @@ name = "maildirclean"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
authors = [
{ name = "Alex Selimov", email = "alex@alexselimov.com" }
]
authors = [{ name = "Alex Selimov", email = "alex@alexselimov.com" }]
requires-python = ">=3.13"
dependencies = [
"pandas>=2.2.3",
]
dependencies = ["pandas>=2.2.3"]
[project.scripts]
maildirclean = "maildirclean:main"
@ -19,6 +15,4 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[dependency-groups]
dev = [
"pytest>=8.3.5",
]
dev = ["pytest>=8.3.5"]

View File

@ -1,2 +1,5 @@
def main() -> None:
print("Hello from maildirclean!")
from .cli import cli
def main() -> int:
    """Console-script entry point: run the CLI and hand back its exit status."""
    exit_status = cli()
    return exit_status

160
src/maildirclean/cli.py Normal file
View File

@ -0,0 +1,160 @@
import argparse
import sys
from pathlib import Path
from .maildir import MailDir, TopSender, parse_maildir
def parse_arguments() -> argparse.Namespace:
"""Parse command line arguments
Returns: Namespace object corresponding to parsed arguments
"""
parser = argparse.ArgumentParser(
description="Analyze email metadata from a maildir directory"
)
parser.add_argument(
"maildir", type=str, help="Path to the maildir directory to analyze"
)
parser.add_argument(
"--top",
"-t",
type=int,
default=5,
help="Number of top senders to display (default: 5)",
)
parser.add_argument(
"--verbose", "-v", action="store_true", help="Enable verbose output"
)
args = parser.parse_args()
return args
def cli():
    """Validate the maildir argument and start the interactive session.

    Returns: 1 when the given path is not an existing directory, 0 otherwise
    """
    args = parse_arguments()
    maildir_path = Path(args.maildir)

    # Guard clause: bail out before doing any parsing work on a bad path.
    if not (maildir_path.exists() and maildir_path.is_dir()):
        print(f"Error: {args.maildir} is not a valid directory", file=sys.stderr)
        return 1

    if args.verbose:
        print(f"Analyzing emails in {maildir_path}...")

    run_loop(args, maildir_path)
    return 0
def run_loop(args: argparse.Namespace, maildir_path: str | Path):
    """Parse the maildir, then repeatedly show the top senders and prompt.

    Args:
        args: Parsed command line arguments; ``top`` and ``verbose`` are read
        maildir_path: Directory handed to parse_maildir

    Returns: 0 when no senders are found; otherwise None once the prompt
        loop reports that the user is done
    """
    maildir = parse_maildir(maildir_path)
    if args.verbose:
        print(f"Found {len(maildir._df)} emails")

    keep_going = True
    while keep_going:
        top_senders = maildir.get_top_n_senders(args.top)
        if not top_senders:
            print("No senders found in the maildir", file=sys.stderr)
            return 0

        # Header first, then one numbered line per sender.
        lines = [f"Top {len(top_senders)} senders in {maildir_path}:", "=" * 40]
        for rank, sender in enumerate(top_senders, 1):
            shown_names = ", ".join(sender.names[:5])  # Limit to first 5 names
            hidden_count = len(sender.names) - 5
            if hidden_count > 0:
                shown_names += f" and {hidden_count} more"
            lines.append(
                f"{rank}. {sender.email} - Email count: {sender.count} - Names used: {shown_names}"
            )
        print("\n".join(lines))

        # A falsy result from the prompt loop ends the session.
        keep_going = bool(user_input_loop(top_senders, maildir))
def user_input_loop(top_senders: list[TopSender], maildir: MailDir) -> bool:
    """Prompt for selections until the user is done.

    Args:
        top_senders: Senders currently displayed, passed through to
            handle_user_input
        maildir: Maildir being analyzed, passed through to handle_user_input

    Returns: False once prompting has finished (handle_user_input returned a
        falsy value, or stdin reached end of input), which tells the caller
        to stop its own loop
    """
    while True:
        try:
            user_input = input("> ").strip()
        except EOFError:
            # Ctrl-D or an exhausted pipe: treat it like quitting instead of
            # crashing with a traceback.
            return False
        if not handle_user_input(user_input, top_senders, maildir):
            # Explicit False (rather than falling off the end and returning
            # None) so the declared bool return type is honored.
            return False
def parse_selections(user_input: str, max_selection: int) -> set[int]:
    """
    Parse user input into a set of valid selections.

    Accepts single numbers ("3"), comma-separated lists ("1,3,5"), inclusive
    ranges ("2-4", "1 - 3") and any combination of these.

    Args:
        user_input: User input string with numbers, comma-separated lists, or ranges
        max_selection: Maximum allowed selection number

    Returns: Set of valid selection numbers, each within 1..max_selection

    Raises:
        ValueError: If any input is invalid or out of range
    """
    selections: set[int] = set()

    for item in (piece.strip() for piece in user_input.split(",")):
        if "-" in item:
            # Handle range (e.g., "1-3" or "4 - 5"). A leading "-" (negative
            # number) yields an empty first part and is rejected here too.
            bounds = [part.strip() for part in item.split("-")]
            # isdecimal() rather than isdigit(): the latter also accepts
            # characters such as superscript digits that int() cannot parse.
            if len(bounds) != 2 or not all(bound.isdecimal() for bound in bounds):
                raise ValueError(f"Invalid range format: {item}")

            start, end = int(bounds[0]), int(bounds[1])
            if start > end:
                raise ValueError(
                    f"Invalid range: {item}. Start must be less than or equal to end."
                )
            selections.update(range(start, end + 1))
        elif item.isdecimal():
            # Handle single number
            selections.add(int(item))
        else:
            raise ValueError(f"Invalid input: {item}")

    # Sorted so the error message lists offenders in ascending order instead
    # of whatever order the set happens to iterate in.
    out_of_range = sorted(s for s in selections if not 1 <= s <= max_selection)
    if out_of_range:
        raise ValueError(
            f"Selection(s) out of range: {', '.join(map(str, out_of_range))}. Valid range is 1-{max_selection}"
        )

    return selections
def handle_user_input(user_input: str, top_senders, maildir) -> bool:
    """Process one line typed at the prompt.

    Args:
        user_input: The stripped line entered by the user
        top_senders: Senders currently displayed; selections index into this
            list (1-based)
        maildir: Maildir being analyzed (not used by this function yet)

    Returns: False when the user asked to quit ("q", case-insensitive);
        True when the prompt should be shown again, including after invalid
        input
    """
    if user_input.lower() == "q":
        return False

    try:
        selections = parse_selections(user_input, len(top_senders))
    except ValueError:
        print("Please enter a valid number or 'q' to quit")
        # Keep prompting: the message above asks the user to try again, so
        # ending the loop here (by returning a falsy value) would contradict it.
        return True

    # Sorted so the confirmations appear in the same order as the listing.
    for selection in sorted(selections):
        selected_sender = top_senders[selection - 1]
        print(f"Selected {selected_sender.email}")
    return True

View File

@ -0,0 +1,14 @@
"""Module containing functionality to interact with the filesystem"""
import os
from pathlib import Path
def delete_files(file_list: list[str | Path]):
"""Delete all files in the provided file list
Args:
file_list: List of file paths as either strings or Path
"""
for file in file_list:
os.remove(file)

View File

@ -1,8 +1,11 @@
from pathlib import Path
import pandas as pd
import re
from .utility import first_match_or_empty, to_datetime_safe
METADATA_SCHEMA = sorted(["Path", "From", "Date"])
METADATA_SCHEMA = sorted(["path", "from", "date"])
def make_email_metadata(email_path: str | Path) -> dict[str, str]:
@ -15,13 +18,17 @@ def make_email_metadata(email_path: str | Path) -> dict[str, str]:
Returns: Dict containing the required metadata
"""
key_is_set = {key: False for key in METADATA_SCHEMA}
metadata = {"Path": str(email_path)}
key_is_set["Path"] = True
metadata = {"path": str(email_path)}
key_is_set["path"] = True
with open(email_path, "r") as f:
with open(email_path, "rb") as f:
for line in f:
try:
k, v = [val.strip() for val in line.split(":", maxsplit=1)]
k, v = [
val.strip()
for val in line.decode(errors="ignore").split(":", maxsplit=1)
]
k = k.lower()
if k in METADATA_SCHEMA:
metadata[k] = v
key_is_set[k] = True
@ -51,11 +58,14 @@ def parse_maildir(path_to_dir: str | Path):
class TopSender:
"""Simple class to store the top sender alongside the first 5 names they used"""
"""Simple class to store the top sender,
alongside the names they used and the count of emails sent by them
"""
def __init__(self, email: str, names: list[str]):
def __init__(self, email: str, names: list[str], count: int):
self.email = email
self.names = names
self.count = count
class MailDir:
@ -63,10 +73,17 @@ class MailDir:
Stores the metadata associated with all local emails.
"""
name_regex = r"^(.*?)(?=<)"
email_regex = r"<?([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)>?"
KEYS_AND_FUNCS = {
"Name": lambda df: df["From"].map(lambda x: x.split("<")[0].strip('" ')),
"Email": lambda df: df["From"].map(lambda x: x.split("<")[1].strip(">")),
"Date": lambda df: pd.to_datetime(df["Date"]),
"name": lambda df: df["from"].map(
lambda x: first_match_or_empty(MailDir.name_regex, x).strip('" ')
),
"email": lambda df: df["from"].map(
lambda x: first_match_or_empty(MailDir.email_regex, x).strip("")
),
"date": lambda df: df["date"].map(lambda x: to_datetime_safe(x)),
}
def __init__(self, email_metadata: list[dict[str, str]]):
@ -91,11 +108,14 @@ class MailDir:
"""
unique_senders = self._df["email"].value_counts().iloc[0:n]
senders = [
TopSender(
email, list(self._df.loc[self._df["Email"] == email, "Name"].unique())
email,
list(self._df.loc[self._df["email"] == email, "name"].unique()),
count,
)
for email in self._df["Email"].value_counts().iloc[0:n].index
for (email, count) in unique_senders.items()
]
return senders

View File

@ -0,0 +1,28 @@
import re
import pandas as pd
from datetime import datetime
def first_match_or_empty(pattern: str, text: str) -> str:
    """Get the first match for the provided pattern or "" if there is none.

    This is a wrapper to facilitate usage of re.search in lambda expressions.

    Args:
        pattern: Pattern to search for. If it contains capture groups, the
            first group is returned; otherwise the whole match is.
        text: Text that will be searched

    Returns: First pattern match or ""
    """
    match = re.search(pattern, text)
    if match is None:
        return ""

    # A pattern without any capture group has no group(1); fall back to the
    # whole match rather than raising IndexError.
    captured = match.group(1) if match.re.groups else match.group(0)

    # An optional group that did not participate in the match is None;
    # normalize it so the function always returns a str.
    return captured or ""
def to_datetime_safe(datetime_str: str):
    """Convert a date string with pandas without letting parse errors escape.

    Args:
        datetime_str: Date text to convert

    Returns: Whatever pd.to_datetime produces with ``format="mixed"`` and
        ``errors="coerce"``; if that call raises ValueError or AttributeError,
        the current local time is returned instead
    """
    try:
        parsed = pd.to_datetime(datetime_str, format="mixed", errors="coerce")
    except (ValueError, AttributeError):
        parsed = datetime.now()
    return parsed

View File

@ -125,13 +125,13 @@ def sample_email_dir(tmp_dir):
@pytest.fixture
def sample_email_metadata():
return [
{"From": "John Doe <john.doe@example.com>", "Date": "2025-01-01"},
{"From": "John Doe <john.doe@example.com>", "Date": "2025-01-02"},
{"From": "Johnny Doe <john.doe@example.com>", "Date": "2025-01-03"},
{"From": "J. Doe <john.doe@example.com>", "Date": "2025-01-04"},
{"From": "Jane Smith <jane.smith@example.com>", "Date": "2025-01-05"},
{"From": "Jane S. <jane.smith@example.com>", "Date": "2025-01-06"},
{"From": "Alex Johnson <alex.johnson@example.com>", "Date": "2025-01-07"},
{"From": "Alex J. <alex.johnson@example.com>", "Date": "2025-01-08"},
{"From": "Sarah Williams <sarah@example.com>", "Date": "2025-01-09"},
{"from": "John Doe <john.doe@example.com>", "date": "2025-01-01"},
{"from": "John Doe <john.doe@example.com>", "date": "2025-01-02"},
{"from": "Johnny Doe <john.doe@example.com>", "date": "2025-01-03"},
{"from": "J. Doe <john.doe@example.com>", "date": "2025-01-04"},
{"from": "Jane Smith <jane.smith@example.com>", "date": "2025-01-05"},
{"from": "Jane S. <jane.smith@example.com>", "date": "2025-01-06"},
{"from": "Alex Johnson <alex.johnson@example.com>", "date": "2025-01-07"},
{"from": "Alex J. <alex.johnson@example.com>", "date": "2025-01-08"},
{"from": "Sarah Williams <sarah@example.com>", "date": "2025-01-09"},
]

87
tests/test_cli.py Normal file
View File

@ -0,0 +1,87 @@
import pytest
from pathlib import Path
# Function under test: parse_selections from the maildirclean.cli module
from maildirclean.cli import parse_selections
def test_single_number():
    """A lone number yields a one-element set."""
    assert parse_selections("3", 5) == {3}
def test_comma_separated_list():
    """Comma-separated numbers are all collected."""
    assert parse_selections("1,3,5", 5) == {1, 3, 5}
def test_range():
    """A dash range is inclusive on both ends."""
    assert parse_selections("2-4", 5) == {2, 3, 4}
def test_range_with_spaces():
    """Whitespace around the dash of a range is ignored."""
    assert parse_selections("1 - 3", 5) == {1, 2, 3}
def test_combined_input():
    """Single numbers and ranges can be mixed in one input."""
    assert parse_selections("1,3-5,7", 10) == {1, 3, 4, 5, 7}
def test_duplicate_numbers():
    """Repeated and overlapping entries collapse, since the result is a set."""
    assert parse_selections("1,1,2,2-4,3", 5) == {1, 2, 3, 4}
def test_invalid_format():
    """Non-numeric input is rejected with an 'Invalid input' error."""
    expected_message = "Invalid input: abc"
    with pytest.raises(ValueError, match=expected_message):
        parse_selections("abc", 5)
def test_invalid_range_format():
    """A range with a non-numeric bound is rejected as a bad range format."""
    expected_message = "Invalid range format: 2-a"
    with pytest.raises(ValueError, match=expected_message):
        parse_selections("2-a", 5)
def test_inverted_range():
    """A range whose start exceeds its end is rejected."""
    expected_message = "Invalid range: 5-2"
    with pytest.raises(ValueError, match=expected_message):
        parse_selections("5-2", 5)
def test_out_of_range():
    """Numbers above the maximum trigger the out-of-range error."""
    expected_pattern = "Selection.*out of range"
    with pytest.raises(ValueError, match=expected_pattern):
        parse_selections("3,6,8", 5)
def test_mix_valid_and_invalid():
    """One bad item invalidates the whole input, even among valid ones."""
    mixed_input = "1,abc,3"
    with pytest.raises(ValueError):
        parse_selections(mixed_input, 5)
def test_empty_input():
    """An empty string is not a valid selection."""
    empty_input = ""
    with pytest.raises(ValueError):
        parse_selections(empty_input, 5)
def test_whitespace_only():
    """Input consisting only of whitespace is not a valid selection."""
    blank_input = " "
    with pytest.raises(ValueError):
        parse_selections(blank_input, 5)
def test_max_boundary():
    """Boundary case: the maximum itself is an allowed selection."""
    upper_limit = 5
    assert parse_selections("5", upper_limit) == {upper_limit}
def test_complex_input():
    """Several ranges and a single number, separated by comma plus space."""
    assert parse_selections("1-2, 4, 6-8", 10) == {1, 2, 4, 6, 7, 8}
def test_negative_numbers():
    """A negative number contains '-' and is therefore reported as a bad range."""
    # The previous pattern ended in ":*", which as a regex means "zero or more
    # colons" rather than "anything"; pin the message that is actually raised.
    with pytest.raises(ValueError, match="Invalid range format: -1"):
        parse_selections("-1,2", 5)

10
tests/test_filedir.py Normal file
View File

@ -0,0 +1,10 @@
from fixtures import *
from maildirclean.filedir import delete_files
def test_delete_files(sample_email_dir):
    """Every top-level entry of the sample directory is gone after delete_files."""
    targets = [*Path(sample_email_dir).glob("*")]
    delete_files(targets)
    assert not any(Path(target).is_file() for target in targets)

View File

@ -8,20 +8,20 @@ def test_email_parsing(test_email):
metadata = make_email_metadata(test_email)
assert metadata["From"] == '"John Doe" <sender@example.com>'
assert metadata["Date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
assert metadata["Path"] == str(test_email)
assert metadata["from"] == '"John Doe" <sender@example.com>'
assert metadata["date"] == "Wed, 16 Apr 2025 12:23:35 -0400"
assert metadata["path"] == str(test_email)
def test_maildir_creation(test_email):
maildir = MailDir([make_email_metadata(test_email)])
metadata = maildir._df.iloc[0]
assert metadata["From"] == '"John Doe" <sender@example.com>'
assert metadata["Name"] == "John Doe"
assert metadata["Email"] == "sender@example.com"
assert metadata["Date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400")
assert metadata["Path"] == str(test_email)
assert metadata["from"] == '"John Doe" <sender@example.com>'
assert metadata["name"] == "John Doe"
assert metadata["email"] == "sender@example.com"
assert metadata["date"] == pd.to_datetime("Wed, 16 Apr 2025 12:23:35 -0400")
assert metadata["path"] == str(test_email)
def test_get_top_n_senders(sample_email_metadata):
@ -94,9 +94,9 @@ def test_parse_maildir(sample_email_dir):
maildir = parse_maildir(sample_email_dir)
assert len(maildir._df) == 3
assert "test@something.org" in list(maildir._df["Email"])
assert "not_a_test@something.org" in list(maildir._df["Email"])
assert "test@something.org" in list(maildir._df["email"])
assert "not_a_test@something.org" in list(maildir._df["email"])
assert "Test" in list(maildir._df["Name"])
assert "Not a Test" in list(maildir._df["Name"])
assert "Test2" in list(maildir._df["Name"])
assert "Test" in list(maildir._df["name"])
assert "Not a Test" in list(maildir._df["name"])
assert "Test2" in list(maildir._df["name"])