claudetool/onstart: add comprehensive tests for non-ASCII filename handling Add test cases validating AnalyzeCodebase() correctly processes files with Unicode characters in filenames, ensuring proper categorization and analysis. Problem Analysis: The AnalyzeCodebase function uses git ls-files to enumerate repository files and categorize them by type. While the implementation should theoretically handle Unicode filenames, there were no existing tests to verify this behavior with international characters, emojis, combining characters, or right-to-left scripts. Implementation Changes: 1. Test Data Creation: - Created testdata directory with files containing non-ASCII characters - Included Chinese (测试文件.go), French (café.js), Russian (русский.py) - Added emoji (🚀rocket.md), German umlauts (Übung.html) - Included Japanese (Makefile-日本語), Spanish (readme-español.md) - Added Korean guidance file (subdir/claude.한국어.md) in subdirectory 2. Comprehensive Test Cases: - TestAnalyzeCodebase validates file counting and extension tracking - Verifies proper categorization of build, documentation, and guidance files - Tests git ls-files integration with Unicode filenames - Confirms extension counting works with non-ASCII characters 3. Edge Case Testing: - Added combining characters test (filé̂.go) - Arabic right-to-left script test (مرحبا.py) - Mixed Unicode with emoji test (test中文🚀.txt) - Validates categorizeFile function handles Unicode paths correctly 4. 
File Categorization Validation: - Japanese Makefile correctly identified as build file - Spanish README properly categorized as documentation - Korean Claude file in subdirectory marked as guidance file - Extension counting accurate across all Unicode filenames Technical Details: - Uses git ls-files -z for null-separated output handling Unicode safely - Test files represent major Unicode blocks: CJK, Latin Extended, Cyrillic - Proper handling of combining characters and emoji sequences - Validates both filename parsing and categorization logic paths Benefits: - Ensures international users can use non-ASCII filenames - Validates Unicode safety in codebase analysis pipeline - Prevents regressions in Unicode filename handling - Comprehensive coverage of real-world filename scenarios Testing: - All tests pass with current implementation - Verified git ls-files correctly enumerates Unicode filenames - Confirmed extension extraction works with international characters - Validated categorization logic handles Unicode paths properly This test suite ensures AnalyzeCodebase robustly handles international codebases with diverse filename conventions and character encodings. Co-Authored-By: sketch <hello@sketch.dev> Change-ID: s2431e70f6f23ec83k

commit: f1e517d64fe4a726552f5d240a1ecb3d418f16b6 [log] [tgz]
author: Marc-Antoine Ruel <maruel@gmail.com> Sun Jun 08 17:30:37 2025 +0000
committer: Philip Zeyliger <philip.zeyliger@gmail.com> Sun Jun 08 13:08:59 2025 -0700
tree: ace5b25920201d79054a8b3f7cc17cf884fae8c5
parent: de19aca257ab21956f2fba828d9265ef218687da [diff]
diff --git a/claudetool/onstart/analyze_test.go b/claudetool/onstart/analyze_test.go
new file mode 100644
index 0000000..be70ce7
--- /dev/null
+++ b/claudetool/onstart/analyze_test.go

@@ -0,0 +1,157 @@
+package onstart
+
+import (
+	"context"
+	"slices"
+	"testing"
+)
+
+func TestAnalyzeCodebase(t *testing.T) {
+	t.Run("Basic Analysis", func(t *testing.T) {
+		// Smoke test: analyze the current directory (".") and only require a non-nil, non-empty result
+		codebase, err := AnalyzeCodebase(context.Background(), ".")
+		if err != nil {
+			t.Fatalf("AnalyzeCodebase failed: %v", err)
+		}
+
+		if codebase == nil {
+			t.Fatal("Expected non-nil codebase")
+		}
+
+		if codebase.TotalFiles == 0 {
+			t.Error("Expected some files to be analyzed")
+		}
+
+		if len(codebase.ExtensionCounts) == 0 {
+			t.Error("Expected extension counts to be populated")
+		}
+	})
+
+	t.Run("Non-ASCII Filenames", func(t *testing.T) {
+		// Analyze ./testdata, whose fixture files all have non-ASCII characters in their names
+		testdataPath := "./testdata"
+		codebase, err := AnalyzeCodebase(context.Background(), testdataPath)
+		if err != nil {
+			t.Fatalf("AnalyzeCodebase failed with non-ASCII filenames: %v", err)
+		}
+
+		if codebase == nil {
+			t.Fatal("Expected non-nil codebase")
+		}
+
+		// We expect 8 files in our testdata directory: 7 at the top level plus subdir/claude.한국어.md
+		expectedFiles := 8
+		if codebase.TotalFiles != expectedFiles {
+			t.Errorf("Expected %d files, got %d", expectedFiles, codebase.TotalFiles)
+		}
+
+		// Verify extension counts for our non-ASCII files; the counts below sum to the 8 files (extensions not listed here are not rejected by the loop)
+		expectedExtensions := map[string]int{
+			".go":            1, // 测试文件.go
+			".js":            1, // café.js
+			".py":            1, // русский.py
+			".md":            3, // 🚀rocket.md, readme-español.md, subdir/claude.한국어.md
+			".html":          1, // Übung.html
+			"<no-extension>": 1, // Makefile-日本語
+		}
+
+		for ext, expectedCount := range expectedExtensions {
+			actualCount, exists := codebase.ExtensionCounts[ext]
+			if !exists {
+				t.Errorf("Expected extension %s to be found", ext)
+				continue
+			}
+			if actualCount != expectedCount {
+				t.Errorf("Expected %d files with extension %s, got %d", expectedCount, ext, actualCount)
+			}
+		}
+
+		// Verify file categorization works with non-ASCII filenames; expected paths are written relative to testdataPath
+		// Check build files
+		if !slices.Contains(codebase.BuildFiles, "Makefile-日本語") {
+			t.Error("Expected Makefile-日本語 to be categorized as a build file")
+		}
+
+		// Check documentation files
+		if !slices.Contains(codebase.DocumentationFiles, "readme-español.md") {
+			t.Error("Expected readme-español.md to be categorized as a documentation file")
+		}
+
+		// Check guidance files (this one lives in a subdirectory of testdata)
+		if !slices.Contains(codebase.GuidanceFiles, "subdir/claude.한국어.md") {
+			t.Error("Expected subdir/claude.한국어.md to be categorized as a guidance file")
+		}
+	})
+}
+
+func TestCategorizeFile(t *testing.T) {
+	t.Run("Non-ASCII Filenames", func(t *testing.T) {
+		tests := []struct {
+			name     string
+			path     string
+			expected string // value categorizeFile must return for path; "" presumably means "no category" — confirm against categorizeFile
+		}{
+			{"Chinese Go file", "测试文件.go", ""},
+			{"French JS file", "café.js", ""},
+			{"Russian Python file", "русский.py", ""},
+			{"Emoji markdown file", "🚀rocket.md", ""},
+			{"German HTML file", "Übung.html", ""},
+			{"Japanese Makefile", "Makefile-日本語", "build"},
+			{"Spanish README", "readme-español.md", "documentation"},
+			{"Korean Claude file", "subdir/claude.한국어.md", "guidance"},
+			// Edge cases: mixed CJK and emoji, combining characters, and right-to-left script; each is expected to return ""
+			{"Mixed Unicode file", "test中文🚀.txt", ""},
+			{"Combining characters", "filé̂.go", ""}, // file with combining acute and circumflex accents
+			{"Right-to-left script", "مرحبا.py", ""},  // Arabic "hello"
+		}
+
+		for _, tt := range tests {
+			t.Run(tt.name, func(t *testing.T) {
+				result := categorizeFile(tt.path)
+				if result != tt.expected {
+					t.Errorf("categorizeFile(%q) = %q, want %q", tt.path, result, tt.expected)
+				}
+			})
+		}
+	})
+}
+
+func TestTopExtensions(t *testing.T) {
+	t.Run("With Non-ASCII Files", func(t *testing.T) {
+		// Create a test codebase with known extension counts (synthetic data; despite the subtest name, no non-ASCII filenames are involved)
+		codebase := &Codebase{
+			ExtensionCounts: map[string]int{
+				".md":   5, // Most common
+				".go":   3,
+				".js":   2,
+				".py":   1,
+				".html": 1, // Least common (tied with .py)
+			},
+			TotalFiles: 12, // equals the sum of the counts above
+		}
+
+		topExt := codebase.TopExtensions()
+		if len(topExt) != 5 {
+			t.Errorf("Expected 5 top extensions, got %d", len(topExt))
+		}
+
+		// Check that extensions are sorted by count (descending); the list also pins the "ext: count (pct%)" format and the .html-before-.py tie order
+		expected := []string{
+			".md: 5 (42%)",
+			".go: 3 (25%)",
+			".js: 2 (17%)",
+			".html: 1 (8%)",
+			".py: 1 (8%)",
+		}
+
+		for i, expectedExt := range expected {
+			if i >= len(topExt) {
+				t.Errorf("Missing expected extension at index %d: %s", i, expectedExt)
+				continue
+			}
+			if topExt[i] != expectedExt {
+				t.Errorf("Expected extension %q at index %d, got %q", expectedExt, i, topExt[i])
+			}
+		}
+	})
+}

diff --git "a/claudetool/onstart/testdata/Makefile-\346\227\245\346\234\254\350\252\236" "b/claudetool/onstart/testdata/Makefile-\346\227\245\346\234\254\350\252\236"
new file mode 100644
index 0000000..c78f45a
--- /dev/null
+++ "b/claudetool/onstart/testdata/Makefile-\346\227\245\346\234\254\350\252\236"

@@ -0,0 +1,10 @@
+# Makefile with Japanese characters in filename
+# This should be categorized as a build file
+
+all:
+	echo "Building with Japanese characters in Makefile name"
+
+clean:
+	rm -f *.o
+
+.PHONY: all clean

diff --git "a/claudetool/onstart/testdata/caf\303\251.js" "b/claudetool/onstart/testdata/caf\303\251.js"
new file mode 100644
index 0000000..6cd53e3
--- /dev/null
+++ "b/claudetool/onstart/testdata/caf\303\251.js"

@@ -0,0 +1,2 @@
+// JavaScript file with French accent in filename
+console.log('Hello from café.js');

diff --git "a/claudetool/onstart/testdata/readme-espa\303\261ol.md" "b/claudetool/onstart/testdata/readme-espa\303\261ol.md"
new file mode 100644
index 0000000..4232509
--- /dev/null
+++ "b/claudetool/onstart/testdata/readme-espa\303\261ol.md"

@@ -0,0 +1,8 @@
+# README Español
+
+This is a documentation file with Spanish characters in the filename.
+
+## Características
+
+- Soporte para Unicode
+- Caracteres españoles en nombres de archivo

diff --git "a/claudetool/onstart/testdata/subdir/claude.\355\225\234\352\265\255\354\226\264.md" "b/claudetool/onstart/testdata/subdir/claude.\355\225\234\352\265\255\354\226\264.md"
new file mode 100644
index 0000000..ab36b92
--- /dev/null
+++ "b/claudetool/onstart/testdata/subdir/claude.\355\225\234\352\265\255\354\226\264.md"

@@ -0,0 +1,8 @@
+# Claude Guidance with Korean Characters
+
+This file should be categorized as a guidance file since its name starts with 'claude.' and ends with '.md'.
+
+## 지침
+
+- 한국어 문자 지원
+- 파일 이름에 유니코드 사용

diff --git "a/claudetool/onstart/testdata/\303\234bung.html" "b/claudetool/onstart/testdata/\303\234bung.html"
new file mode 100644
index 0000000..afc66ab
--- /dev/null
+++ "b/claudetool/onstart/testdata/\303\234bung.html"

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>German Umlaut Test</title>
+</head>
+<body>
+    <h1>Übung HTML File</h1>
+    <p>This HTML file has German umlaut characters in the filename.</p>
+</body>
+</html>

diff --git "a/claudetool/onstart/testdata/\321\200\321\203\321\201\321\201\320\272\320\270\320\271.py" "b/claudetool/onstart/testdata/\321\200\321\203\321\201\321\201\320\272\320\270\320\271.py"
new file mode 100644
index 0000000..8110369
--- /dev/null
+++ "b/claudetool/onstart/testdata/\321\200\321\203\321\201\321\201\320\272\320\270\320\271.py"

@@ -0,0 +1,2 @@
+# Python file with Russian characters in filename
+print('Hello from русский.py')

diff --git "a/claudetool/onstart/testdata/\346\265\213\350\257\225\346\226\207\344\273\266.go" "b/claudetool/onstart/testdata/\346\265\213\350\257\225\346\226\207\344\273\266.go"
new file mode 100644
index 0000000..30a3d59
--- /dev/null
+++ "b/claudetool/onstart/testdata/\346\265\213\350\257\225\346\226\207\344\273\266.go"

@@ -0,0 +1,6 @@
+// Package test with Chinese characters in filename
+package test
+
+func TestFunction() {
+	// This is a test file with Chinese characters in the filename
+}

diff --git "a/claudetool/onstart/testdata/\360\237\232\200rocket.md" "b/claudetool/onstart/testdata/\360\237\232\200rocket.md"
new file mode 100644
index 0000000..6140a51
--- /dev/null
+++ "b/claudetool/onstart/testdata/\360\237\232\200rocket.md"

@@ -0,0 +1,8 @@
+# Markdown file with Emoji in filename
+
+This is a Markdown file with an emoji character in the filename; the tests expect it to stay uncategorized rather than count as documentation.
+
+## Features
+
+- Unicode support
+- Emoji in filenames
commit	f1e517d64fe4a726552f5d240a1ecb3d418f16b6	[log] [tgz]
author	Marc-Antoine Ruel <maruel@gmail.com>	Sun Jun 08 17:30:37 2025 +0000
committer	Philip Zeyliger <philip.zeyliger@gmail.com>	Sun Jun 08 13:08:59 2025 -0700
tree	ace5b25920201d79054a8b3f7cc17cf884fae8c5
parent	de19aca257ab21956f2fba828d9265ef218687da [diff]