diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5a5c554
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,39 @@
+FROM docker.io/python:3.13.11-slim-bookworm
+
+ARG APP_HOME=/app
+
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+
+RUN apt-get update && apt-get install --no-install-recommends -y \
+ build-essential \
+ # ssdeep / fuzzy hashing dependencies.
+ libfuzzy-dev \
+    # the ssdeep binary itself, in case you want a CLI option
+ ssdeep
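+
+# The ssdeep CLI is only a convenience for manual spot checks inside the container,
+# e.g. `ssdeep /usr/bin/perl` prints the fuzzy hash of that binary (illustrative).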
+
+WORKDIR ${APP_HOME}
+
+RUN pip install --upgrade \
+ pip \
+ "setuptools<70" \
+ wheel
+
+# Set constraint to ensure ssdeep build uses compatible setuptools.
+ENV PIP_CONSTRAINT=/tmp/constraints.txt
+RUN echo "setuptools<70" > /tmp/constraints.txt
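+# Using the environment variable (rather than a one-off --constraint flag) keeps the
+# constraint active for every later pip invocation, including the ssdeep source build below.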
+
+RUN pip install ssdeep==3.4
+
+COPY ./compare_ssdeep_ppdeep.py .
+
+# Pick a ppdeep version...
+
+# 1) Current 20251115 ppdeep version
+RUN pip install ppdeep==20251115
+
+# 2) The updated ppdeep from this PR (uncomment to build from the local sources)
+# COPY ./ppdeep.py .
+# COPY ./setup.py .
+# COPY ./README.md .
+# RUN python setup.py install
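+
+# Build and run the comparison (illustrative; the image tag is arbitrary):
+#   docker build -t ssdeep-vs-ppdeep .
+#   docker run --rm ssdeep-vs-ppdeep python compare_ssdeep_ppdeep.py
+#   docker run --rm ssdeep-vs-ppdeep python compare_ssdeep_ppdeep.py --random-test --num-tests 100000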
diff --git a/compare_ssdeep_ppdeep.py b/compare_ssdeep_ppdeep.py
new file mode 100644
index 0000000..266dc1d
--- /dev/null
+++ b/compare_ssdeep_ppdeep.py
@@ -0,0 +1,521 @@
+#!/usr/bin/env python
+"""
+Compare the ssdeep and ppdeep libraries to check that they produce identical results.
+Tests context-triggered piecewise hashes (CTPH), a.k.a. fuzzy hashes, on string, bytes, and file inputs.
+"""
+
+import argparse
+import os
+import sys
+
+
+def generate_test_data():
+    """Generate test objects to hash: strings, bytes objects, and the contents of a few local files."""
+ test_objects = []
+
+ # 35 string test cases (25 regular + 5 hex escape + 5 unicode escape)
+ string_tests = [
+ "Hello, World!",
+ "The quick brown fox jumps over the lazy dog",
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit",
+ "a" * 100,
+ "b" * 500,
+ "Test with special chars: !@#$%^&*()_+-={}[]|:;<>?,./",
+ "Multiline\nstring\nwith\nnewlines",
+ "Tab\tseparated\tvalues",
+ "Unicode test: café, naïve, 日本語",
+ "Email: test@example.com, URL: https://example.com",
+ 'JSON-like: {"key": "value", "number": 123}',
+        'XML-like: <tag>content</tag>',
+ "Base64-like: SGVsbG8gV29ybGQh",
+ "Hex-like: 48656c6c6f20576f726c6421",
+ "Long repeated pattern: " + "pattern" * 100,
+ "Mixed case: AbCdEfGhIjKlMnOpQrStUvWxYz",
+ "Numbers only: 1234567890" * 10,
+ "Whitespace variations: spaces tabs\ttabs ",
+ "Path-like: /usr/local/bin/python3.13",
+ "Windows path: C:\\Users\\Admin\\Documents\\file.txt",
+ "SQL-like: SELECT * FROM users WHERE id = 1",
+        "HTML: <html><head><title>Title</title></head><body></body></html>",
+ "Empty string",
+ "Single char: x",
+ "Very long string: " + "x" * 10000,
+ # Hex escape sequences
+ "\x48\x65\x6c\x6c\x6f", # "Hello" in hex
+ "\x00\x01\x02\x03\x04", # Control characters
+ "Null byte test: \x00 middle",
+ "\xff\xfe\xfd", # High byte values
+ "Mixed: \x41\x42\x43 ABC", # Hex + regular
+ # Unicode escape sequences
+ "\u00e9\u00e0\u00fc", # é à ü
+ "\u4e2d\u6587", # 中文 (Chinese)
+ "Emoji: \U0001f600\U0001f44d", # 😀👍
+ "\u03b1\u03b2\u03b3", # αβγ (Greek)
+ "Mixed: \u2665 hearts \u2660 spades", # ♥ ♠
+ ]
+
+ for i, s in enumerate(string_tests):
+ test_objects.append(
+ {
+ "type": "string",
+ "id": f"str_{i + 1}",
+ "data": s,
+ }
+ )
+
+ # Bytes test cases. Some are byte literals, some are UTF-8 encoded strings, and some are binary data.
+ bytes_tests = [
+ b"Hello, World!",
+ b"The quick brown fox jumps over the lazy dog",
+ b"Lorem ipsum dolor sit amet, consectetur adipiscing elit",
+ b"a" * 100,
+ b"b" * 500,
+ b"Test with special chars: !@#$%^&*()_+-={}[]|:;<>?,./",
+ b"Multiline\nstring\nwith\nnewlines",
+ b"Tab\tseparated\tvalues",
+ b"Email: test@example.com, URL: https://example.com",
+ b'JSON-like: {"key": "value", "number": 123}',
+        b'XML-like: <tag>content</tag>',
+ b"Base64-like: SGVsbG8gV29ybGQh",
+ b"Hex-like: 48656c6c6f20576f726c6421",
+ b"Long repeated pattern: " + b"pattern" * 100,
+ b"Mixed case: AbCdEfGhIjKlMnOpQrStUvWxYz",
+ b"Numbers only: 1234567890" * 10,
+ b"Whitespace variations: spaces tabs\ttabs ",
+ b"Path-like: /usr/local/bin/python3.13",
+ b"Windows path: C:\\Users\\Admin\\Documents\\file.txt",
+ b"SQL-like: SELECT * FROM users WHERE id = 1",
+        b"HTML: <html><head><title>Title</title></head><body></body></html>",
+ b"Empty bytes",
+ b"Single char: x",
+ b"Very long bytes: " + b"x" * 10000,
+ bytes(range(256)), # All possible byte values
+ # Hex escape sequences
+ b"\x48\x65\x6c\x6c\x6f", # b"Hello" in hex
+ b"\x00\x01\x02\x03\x04", # Control characters
+ b"Null byte test: \x00 middle",
+ b"\xff\xfe\xfd", # High byte values
+ b"Mixed: \x41\x42\x43 ABC", # Hex + regular
+ # Unicode escape sequences (as UTF-8 encoded bytes)
+ "\u00e9\u00e0\u00fc".encode("utf-8"), # é à ü
+ "\u4e2d\u6587".encode("utf-8"), # 中文 (Chinese)
+ "Emoji: \U0001f600\U0001f44d".encode("utf-8"), # 😀👍
+ "\u03b1\u03b2\u03b3".encode("utf-8"), # αβγ (Greek)
+ "Mixed: \u2665 hearts \u2660 spades".encode("utf-8"), # ♥ ♠
+ # 32
+ b"\xc1C*\xa3 \xb3D@\xe4\x08\xab\xbc\x94\xc0W\x8d\x9e\xbc}\\{\x8d*\x07\x9f\xf9\xc8\x04\t\xba2\xa9",
+ b"\xa6`\x02\xda\x9aB\xf1Up\x1f\x876Ay\x07\xf7}\x10\xd7\xb7\xfa\x8fWs\x9d\\}X\xff\xe2\x9c\x8e",
+ b"\xa3\xcf\x99\xdd[\x9a?e\x0f\xbf]\xdd\x9e\xcb.\x17V`3\xbf\xed&T\xa6\xecN\x10\xfd\xc5\xda8\x1d",
+ b'\\\x07\xc4O\xf05\xb9\x19Z\xb9\xdb\x9a\xd5\xed\x93\x9d\xc7`\xab\xb6\xa8\x99\xc4\x98"\xde\xde9\xfdb<\x9b',
+ b"\x98\x11\x8d]\x93\x82\xaaEx~<}\\\x1a\xf9!\xae\xcc\x8cn-E\xe9\xa8\xe3\x0f\x0f\xa6\xa1\xdc\xd2\xe9\xa1",
+ # 33
+ b"\x98\x11\x8d]\x93\x82\xaaEx~<}\\\x1a\xf9!\xae\xcc\x8cn-E\xe9\xa8\xe3\x0f\x0f\xa6\xa1\xdc\xd2\xe9\xa1",
+ # 64
+        b"z\x06vf\xbb\xf9J*|4\\\xdd\x17\xd7\x8f9\xb3\x9a\r\xd2\xa2\xf0\xe3\x0f\xe4\xb5\\|\x7f\x1cq\xd0\x01\xaf\x86\x8b\xd1~\xf8*-\xf7\x12\xb5):Q\xa5z\xdc\xcb\x0bv|\x06c\xf0\xd2s\x18\rb\xd8\xed",
+ # 65
+        b"\x14\xbc\x91V2K\x8a\xce>\xdb\xf1\xe8\x1e\xef\xc0F\xaf\xb6\xd6(\xd2\xda\xd4#\xf6\x7fl\n\x7fT`-m\xd7\x1c;\x90X\x91\x80\x88\x99\xb6-h-\xd9\xdcx\xfb\xa6Tn\x87Pw\xfa\x9e:\x00*\\g\x1f\x80",
+ b"\x00\x7f\xc5@\xc7\x18\x04\x995\x03\x9a\x0e\x8e\xb39\x13\x17\xb1SQ\xe2\xab\xb9\xe0D\x86,\x11\x9d7\xb1\xa2<\x95\x8b\x9e\xb8\xfe;\x9c\xca\xd3\x82'\x91\xe8\xd8f\xe6+\x9f\x12w\x16S\xbah\xa9\xee\xbd!\xc4+\xa9\xfe",
+ # 127
+        b'\xfa\xe8p\x08\x8b)T[\xc0\xeaS\x05-\xea\xa1\xed\x85V\xe0\xee\xab\xef\x17\x16(k\x14\rZB)\xbe\xf5!"\xa3R\xb2\x9a\x0c\xd0\xbb\xa5\x81\xcbq\x9eP_L\xc4\x9aP\xdf\x1a\xbcz\xb9\xb1\xa1\x07\x9eC\x12\xb1\xe6{\xf8\x18\x02\xf0B\xe1s\xbf\xb7\x9c\xf8e\\\x11_-\xef2o\xea\x8c<\x05\t\x10\xbdI=(\xf8\n\xffa\x8d\xc4\xd7\x11N\xe3\xf2\xd5\x9bQ#\x94\xe5\xf9\xc7\x1a\xda\xbeR{\xe9\xcf@\xf8\tZM',
+ # 128
+ b"cal\xbc\xaa\xfb\xc3@\x9a\x9euCi\xaf\xc5\xd8$\x8a\xe5\xabE\x85D\xd3\x161i)\xe5\xd4Uj\xdd\xf6\xe6\x08\x1e\xeb\xa8\x8eLd\x12\x81\xdd\xbbF\xc4\xc1\x17\xfd\xda\xb4W\xad_\x90\xadB\x140\xbdFI\xbeL\x9e\xc2\xc6\x03z-t\xbf\x84\xf33\xcd\xaa\x1ds0L\x1c\xaa\x16o\x1d\x078\xa8\x9ez\xa4\xb2\xe3on\xd5*\xbb\x9e?\x1dvf\xc8\xa0\xceHl\xd1\x1b_{\xe7\xdc\x19\x0c2)\r\xed\xa3\xf3\x13aw",
+ ]
+
+ for i, b in enumerate(bytes_tests):
+ test_objects.append(
+ {
+ "type": "bytes",
+ "id": f"bytes_{i + 1}",
+ "data": b,
+ }
+ )
+
+ # Add file tests
+ file_paths = [
+ "/usr/bin/setsid",
+ "/usr/bin/locale",
+ "/usr/bin/last",
+ "/usr/bin/perl",
+ # Add more file paths here as needed
+ ]
+
+ for idx, file_path in enumerate(file_paths, start=1):
+ try:
+ with open(file_path, "rb") as f:
+ file_data = f.read()
+ test_objects.append(
+ {
+ "type": "file",
+ "id": f"file_{idx}",
+ "data": file_data,
+ }
+ )
+ print(f"Added file test {idx}: {file_path} ({len(file_data)} bytes)")
+ except FileNotFoundError:
+ print(f"Warning: File '{file_path}' not found, skipping")
+ except Exception as e:
+ print(f"Warning: Could not read file '{file_path}': {e}")
+
+ return test_objects
+
+
+def compare_libraries():
+ """Compare ssdeep and ppdeep hash results."""
+
+ # Try importing both libraries
+ try:
+ import ssdeep
+
+ has_ssdeep = True
+ except ImportError as e:
+ print(f"Warning: ssdeep not available: {e}")
+ has_ssdeep = False
+
+ try:
+ import ppdeep
+
+ has_ppdeep = True
+ except ImportError as e:
+ print(f"Warning: ppdeep not available: {e}")
+ has_ppdeep = False
+
+ if not has_ssdeep and not has_ppdeep:
+ print("Error: Neither ssdeep nor ppdeep is installed!")
+ sys.exit(1)
+
+ if not has_ssdeep:
+ print("Warning: Only ppdeep is available. Cannot compare.")
+ return
+
+ if not has_ppdeep:
+ print("Warning: Only ssdeep is available. Cannot compare.")
+ return
+
+ print("=" * 80)
+ print("SSDEEP vs PPDEEP COMPARISON")
+ print("=" * 80)
+ print(f"ssdeep version: {ssdeep.__version__ if hasattr(ssdeep, '__version__') else 'unknown'}")
+ print(f"ppdeep version: {ppdeep.__version__ if hasattr(ppdeep, '__version__') else 'unknown'}")
+ print()
+
+ # Generate test data
+ test_objects = generate_test_data()
+ string_count = len([t for t in test_objects if t["type"] == "string"])
+ bytes_count = len([t for t in test_objects if t["type"] == "bytes"])
+ file_count = len([t for t in test_objects if t["type"] == "file"])
+ print(f"Testing {len(test_objects)} objects ({string_count} strings, {bytes_count} bytes, {file_count} files)\n")
+
+ # Track results
+ total_tests = 0
+ matching = 0
+ mismatches = []
+ ssdeep_errors = []
+ ppdeep_errors = []
+
+ # Test each object
+ for obj in test_objects:
+ total_tests += 1
+ obj_id = obj["id"]
+ obj_type = obj["type"]
+ data = obj["data"]
+
+ # Get ssdeep hash
+ try:
+            # Both libraries accept str and bytes directly, so no per-type branching is needed
+            ssdeep_hash = ssdeep.hash(data)
+ except Exception as e:
+ ssdeep_hash = None
+ ssdeep_errors.append({"id": obj_id, "error": str(e)})
+
+ # Get ppdeep hash
+ try:
+            ppdeep_hash = ppdeep.hash(data)
+ except Exception as e:
+ ppdeep_hash = None
+ ppdeep_errors.append({"id": obj_id, "error": str(e)})
+
+ # Compare results
+ if ssdeep_hash is not None and ppdeep_hash is not None:
+ if ssdeep_hash == ppdeep_hash:
+ matching += 1
+ print(f"✓ {obj_id:15} MATCH")
+ else:
+ mismatches.append(
+ {
+ "id": obj_id,
+ "type": obj_type,
+ "ssdeep": ssdeep_hash,
+ "ppdeep": ppdeep_hash,
+ }
+ )
+ print(f"✗ {obj_id:15} MISMATCH")
+ elif ssdeep_hash is None and ppdeep_hash is None:
+ print(f"⚠ {obj_id:15} BOTH FAILED")
+ elif ssdeep_hash is None:
+ print(f"⚠ {obj_id:15} SSDEEP FAILED")
+ else:
+ print(f"⚠ {obj_id:15} PPDEEP FAILED")
+
+ # Print summary
+ print("\n" + "=" * 80)
+ print("SUMMARY")
+ print("=" * 80)
+ print(f"Total tests: {total_tests}")
+ print(f"Matching hashes: {matching} ({matching / total_tests * 100:.1f}%)")
+ print(f"Mismatches: {len(mismatches)}")
+ print(f"ssdeep errors: {len(ssdeep_errors)}")
+ print(f"ppdeep errors: {len(ppdeep_errors)}")
+
+ # Print detailed mismatch information
+ if mismatches:
+ print("\n" + "=" * 80)
+ print("MISMATCHES DETAIL")
+ print("=" * 80)
+ for mismatch in mismatches:
+ print(f"\nID: {mismatch['id']} ({mismatch['type']})")
+ print(f" ssdeep: {mismatch['ssdeep']}")
+ print(f" ppdeep: {mismatch['ppdeep']}")
+
+ # Print error details
+ if ssdeep_errors:
+ print("\n" + "=" * 80)
+ print("SSDEEP ERRORS")
+ print("=" * 80)
+ for error in ssdeep_errors:
+ print(f"{error['id']}: {error['error']}")
+
+ if ppdeep_errors:
+ print("\n" + "=" * 80)
+ print("PPDEEP ERRORS")
+ print("=" * 80)
+ for error in ppdeep_errors:
+ print(f"{error['id']}: {error['error']}")
+
+ # Final verdict
+ print("\n" + "=" * 80)
+ if matching == total_tests:
+ print("✓ RESULT: All hashes match! Libraries are compatible.")
+ print("=" * 80)
+ sys.exit(0)
+ else:
+ print("✗ RESULT: Differences detected! Review mismatches before swapping libraries.")
+ print("=" * 80)
+ sys.exit(1)
+
+
+def find_mismatch_file(start_path="/"):
+ """
+    Recursively walk the filesystem and report every file where ssdeep and ppdeep produce different hashes.
+
+ Args:
+ start_path: Directory to start searching from (default: "/")
+ """
+ try:
+ import ssdeep
+ except ImportError:
+ print("Error: ssdeep not installed")
+ sys.exit(1)
+
+ try:
+ import ppdeep
+ except ImportError:
+ print("Error: ppdeep not installed")
+ sys.exit(1)
+
+ print(f"Searching for hash mismatch starting from: {start_path}")
+ print("Press Ctrl+C to stop\n")
+
+ files_checked = 0
+    errors_skipped = 0
+    mismatch_count = 0
+
+ for root, dirs, files in os.walk(start_path):
+ # Skip common system/virtual directories
+ dirs[:] = [
+ d
+ for d in dirs
+ if d
+ not in [
+ ".git",
+ "node_modules",
+ "__pycache__",
+ ".venv",
+ "venv",
+ "Library",
+ "Applications",
+ "System",
+ "Volumes",
+ "dev",
+ "proc",
+ "sys",
+ ]
+ ]
+
+ for filename in files:
+ filepath = os.path.join(root, filename)
+
+ # Skip symlinks and non-regular files
+ try:
+ if not os.path.isfile(filepath) or os.path.islink(filepath):
+ continue
+ except (OSError, PermissionError):
+ continue
+
+ files_checked += 1
+ if files_checked % 100 == 0:
+ print(f"Checked {files_checked} files...", end="\r")
+
+ try:
+ ssdeep_hash = ssdeep.hash_from_file(filepath)
+ ppdeep_hash = ppdeep.hash_from_file(filepath)
+
+ if ssdeep_hash != ppdeep_hash:
+ print(f"\n\n{'=' * 80}")
+ print("MISMATCH FOUND!")
+ print(f"{'=' * 80}")
+ print(f"File: {filepath}")
+ print(f"Size: {os.path.getsize(filepath)} bytes")
+ print(f"ssdeep: {ssdeep_hash}")
+ print(f"ppdeep: {ppdeep_hash}")
+ print(f"{'=' * 80}")
+ print(f"Total files checked: {files_checked}")
+                    mismatch_count += 1
+                    # Keep walking so every mismatching file gets reported
+
+            except Exception:
+                # Unreadable file, hashing failure, etc.: skip it and keep scanning
+                errors_skipped += 1
+                continue
+
+    if mismatch_count == 0:
+        print("\n\nSearch complete. No mismatches found.")
+    else:
+        print(f"\n\nSearch complete. {mismatch_count} mismatch(es) found.")
+ print(f"Files checked: {files_checked}")
+ print(f"Errors skipped: {errors_skipped}")
+ return None
+
+
+def find_random_mismatch(num_tests=10000, length=32):
+ """
+ Generate random byte strings and test for hash mismatches.
+
+ Args:
+ num_tests: Number of random strings to generate (default: 10000)
+ length: Length of each random byte string (default: 32)
+ """
+ import random
+
+ try:
+ import ssdeep
+ except ImportError:
+ print("Error: ssdeep not installed")
+ sys.exit(1)
+
+ try:
+ import ppdeep
+ except ImportError:
+ print("Error: ppdeep not installed")
+ sys.exit(1)
+
+ print(f"Generating {num_tests} random byte strings of length {length}")
+ print("Press Ctrl+C to stop\n")
+
+ tests_run = 0
+ matches = 0
+ mismatches_found = []
+
+ for i in range(num_tests):
+ tests_run += 1
+ if tests_run % 100 == 0:
+ print(f"Tested {tests_run}/{num_tests} random strings...", end="\r")
+
+ # Generate random bytes
+ random_bytes = bytes(random.randint(0, 255) for _ in range(length))
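+        # (random.randbytes(length) would be a roughly equivalent one-liner on Python 3.9+)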
+
+ try:
+ ssdeep_hash = ssdeep.hash(random_bytes)
+ ppdeep_hash = ppdeep.hash(random_bytes)
+
+ if ssdeep_hash == ppdeep_hash:
+ matches += 1
+ else:
+ mismatches_found.append(
+ {
+ "test_num": tests_run,
+ "data": random_bytes,
+ "ssdeep": ssdeep_hash,
+ "ppdeep": ppdeep_hash,
+ }
+ )
+ print(f"\n\n{'=' * 80}")
+ print("MISMATCH FOUND!")
+ print(f"{'=' * 80}")
+ print(f"Test number: {tests_run}")
+ print(f"Random bytes (hex): {random_bytes.hex()}")
+ print(f"Random bytes (repr): {random_bytes!r}")
+ print(f"Length: {len(random_bytes)}")
+ print(f"ssdeep: {ssdeep_hash}")
+ print(f"ppdeep: {ppdeep_hash}")
+ print(f"{'=' * 80}")
+ # Don't return immediately, continue testing to find all mismatches
+
+ except Exception as e:
+ print(f"\nError testing random bytes at iteration {tests_run}: {e}")
+ continue
+
+ # Print summary
+ print("\n\nRandom testing complete.")
+ print(f"Tests run: {tests_run}")
+ print(f"Matches: {matches}")
+ print(f"Mismatches: {len(mismatches_found)}")
+
+ if len(mismatches_found) == 0:
+ print("\n✓ No mismatches found! Libraries appear compatible.")
+ else:
+ print(f"\n✗ Found {len(mismatches_found)} mismatch(es)!")
+ print("\nAll mismatches:")
+ for idx, mismatch in enumerate(mismatches_found, 1):
+ print(f"\n {idx}. Test #{mismatch['test_num']}: {mismatch['data'].hex()[:60]}...")
+
+ return mismatches_found
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Compare ssdeep and ppdeep library hash outputs")
+ parser.add_argument(
+ "--find-mismatch",
+ action="store_true",
+ help="Recursively search filesystem for a file with mismatched hashes",
+ )
+ parser.add_argument(
+ "--start-path",
+ type=str,
+ default="/",
+ help="Starting directory for mismatch search (default: /)",
+ )
+ parser.add_argument(
+ "--random-test",
+ action="store_true",
+ help="Generate random byte strings to find hash mismatches",
+ )
+ parser.add_argument(
+ "--num-tests",
+ type=int,
+ default=10000,
+ help="Number of random tests to run (default: 10000)",
+ )
+ parser.add_argument(
+ "--length",
+ type=int,
+ default=32,
+ help="Length of random byte strings (default: 32)",
+ )
+
+ args = parser.parse_args()
+
+ if args.find_mismatch:
+ find_mismatch_file(args.start_path)
+ elif args.random_test:
+ find_random_mismatch(args.num_tests, args.length)
+ else:
+ compare_libraries()
diff --git a/ppdeep.py b/ppdeep.py
index 4ddd77e..a037783 100755
--- a/ppdeep.py
+++ b/ppdeep.py
@@ -28,7 +28,7 @@
'''
__title__ = 'ppdeep'
-__version__ = '20251115'
+__version__ = '20260209'
__author__ = 'Marcin Ulikowski'
import os
@@ -78,6 +78,8 @@ def _spamsum(stream, slen):
block_hash1 = block_hash2 = int(HASH_INIT)
hash_string1 = hash_string2 = str()
+ # Track the last character stored at each reset point (for rh==0 case at end)
+ last_char1 = last_char2 = str()
stream.seek(0)
buf = stream.read(STREAM_BUFF_SIZE)
@@ -88,21 +90,30 @@ def _spamsum(stream, slen):
block_hash2 = sum_table[block_hash2][c]
roll_n = next(roll_c)
- roll_h2 = roll_h2 - roll_h1 + (ROLL_WINDOW * b)
- roll_h1 = roll_h1 + b - roll_win[roll_n]
+ # Must use 32-bit unsigned arithmetic to match C's uint32_t behavior
+ # In C, subtraction that goes negative wraps to large positive values
+ roll_h2 = (roll_h2 - roll_h1 + (ROLL_WINDOW * b)) & 0xFFFFFFFF
+ roll_h1 = (roll_h1 + b - roll_win[roll_n]) & 0xFFFFFFFF
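+            # e.g. (5 - 10) & 0xFFFFFFFF == 4294967291 == 0xFFFFFFFB, the same wrap a C uint32_t gives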
roll_win[roll_n] = b
- roll_h3 = (roll_h3 << 5) & 0xFFFFFFFF
- roll_h3 ^= b
+ roll_h3 = ((roll_h3 << 5) ^ b) & 0xFFFFFFFF
- rh = roll_h1 + roll_h2 + roll_h3
+ rh = (roll_h1 + roll_h2 + roll_h3) & 0xFFFFFFFF
if (rh % block_size) == (block_size - 1):
+ # Always store the character (C stores to digest[dindex])
+ last_char1 = B64[block_hash1]
if len(hash_string1) < (SPAMSUM_LENGTH - 1):
- hash_string1 += B64[block_hash1]
+ hash_string1 += last_char1
+ last_char1 = str() # Clear after appending
block_hash1 = HASH_INIT
+ # Only track halfdigest while dindex < SPAMSUM_LENGTH/2
+ if len(hash_string1) < (SPAMSUM_LENGTH // 2):
+ last_char2 = str() # Clear like C's halfdigest = '\0'
if (rh % (block_size * 2)) == ((block_size * 2) - 1):
+ last_char2 = B64[block_hash2]
if len(hash_string2) < ((SPAMSUM_LENGTH // 2) - 1):
- hash_string2 += B64[block_hash2]
+ hash_string2 += last_char2
+ last_char2 = str() # Clear after appending
block_hash2 = HASH_INIT
buf = stream.read(STREAM_BUFF_SIZE)
@@ -110,9 +121,17 @@ def _spamsum(stream, slen):
if block_size > BLOCKSIZE_MIN and len(hash_string1) < (SPAMSUM_LENGTH // 2):
block_size = (block_size // 2)
else:
+ # Append final character - two paths like C code:
+ # 1. If rh != 0: use current hash value
+ # 2. If rh == 0 but we have a stored char: use that
if rh != 0:
hash_string1 += B64[block_hash1]
hash_string2 += B64[block_hash2]
+ else:
+ if last_char1:
+ hash_string1 += last_char1
+ if last_char2:
+ hash_string2 += last_char2
break
return '{0}:{1}:{2}'.format(block_size, hash_string1, hash_string2)