fglock · fglock · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.claude/settings.json b/.claude/settings.json
@@ -0,0 +1,41 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(grep -n *)",
+      "Bash(grep -r *)",
+      "Bash(ls -la *)",
+      "Bash(grep -rn *)",
+      "Bash(find . *)",
+      "Bash(git log *)",
+      "Bash(git status *)",
+      "Bash(grep -A *)",
+      "Bash(ls -lh *)",
+      "Bash(git diff *)",
+      "Bash(find /Users/fglock/projects/PerlOnJava4 *)",
+      "Bash(find /Users/fglock/projects/PerlOnJava4/src/main/java *)",
+      "Bash(git branch *)",
+      "Bash(find ~/.claude/projects/ *)",
+      "Bash(wc -l *)",
+      "Bash(grep -B5 *)",
+      "Bash(grep -A5 *)",
+      "Bash(git show *)",
+      "Bash(ps aux *)",
+      "Bash(find /Users/fglock/projects/PerlOnJava4/src/main/perl/lib *)",
+      "Bash(grep -B *)",
+      "Bash(find ~/.perlonjava *)",
+      "Bash(find ~/.cpan *)",
+      "Bash(find /Users/fglock/projects/PerlOnJava4/src *)",
+      "Bash(find /Users/fglock/projects/PerlOnJava4/examples *)",
+      "Bash(ls -la ~/.cpan *)",
+      "Bash(ls -la ~/.perlonjava *)",
+      "Bash(cat ~/.cpan *)",
+      "Bash(cat ~/.perlonjava *)",
+      "Bash(./jcpan -t *)",
+      "Bash(timeout *)",
+      "Bash(until grep *)",
+      "Bash(ls -la /tmp *)",
+      "Bash(cat /tmp *)",
+      "Read(~/.perlonjava/*)"
+    ]
+  }
+}
diff --git a/dev/modules/sub_handlesviacontinuation.md b/dev/modules/sub_handlesviacontinuation.md
@@ -0,0 +1,62 @@
+# Sub::HandlesVia UTF-8 Fix
+
+**Status**: Solution implemented - Eval-time UTF-8 repair applied to both interpreter and JVM compilation paths
+
+## Problem
+
+When Sub::HandlesVia generates accessor delegation code, orphaned UTF-8 lead bytes (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7) appear in generated Perl code, causing syntax errors:
+
+```
+Global symbol "@shv_tmp\x{c2}" requires explicit package name
+Unrecognized character \x{c2}; at /Users/fglock/.perlonjava/lib/Eval/TypeTiny.pm line 8
+syntax error at set_option=Hash:set line 5, near "\"Wrong number "
+```
+
+## Root Cause Analysis
+
+The corruption stems from UTF-8/Latin-1 encoding mismatch:
+
+1. UTF-8 files may be decoded as Latin-1, so each byte of a multi-byte sequence becomes a separate high-byte character (e.g. `\x{c2}`)
+2. When Perl code such as Sub::HandlesVia::CodeGenerator builds methods by string concatenation, any source string that was mis-decoded this way carries the corruption into the generated code
+3. The orphaned bytes end up in generated Perl code that is eval'd at runtime, which breaks parsing
+
+## Solution (branch: `fix/sub-handlesvia-utf8`)
+
+### Implementation - Eval-time Corruption Repair
+
+Apply UTF-8 corruption repair in BOTH eval paths:
+1. **EvalStringHandler** (interpreter path) - commit d7f725e27
+2. **RuntimeCode.evalStringHelper** (JVM compilation path) - commit a436b95ed
+
+**Why eval-time repair?**
+- Sub::HandlesVia generates code via Perl string concatenation, not regex substitutions
+- Previous regex-only repair missed this code generation path
+- By repairing ALL eval'd code before the Lexer processes it, we catch corruption from all sources
+
+**Changes**:
+- Made `RuntimeRegex.repairLatin1EncodedUtf8IfCorrupted()` public and static
+- Added repair call in EvalStringHandler before parsing (Step 1b)
+- Added repair call in RuntimeCode.evalStringHelper before Lexer/Parser
+- Orphaned lead bytes are removed, allowing valid parsing
+
+## Test Results
+
+Testing is in progress against a fresh build of PerlOnJava with the UTF-8 repair applied to both eval paths; no results are recorded here yet.
+
+## Technical Details
+
+The corruption repair function:
+- Scans for orphaned UTF-8 lead bytes (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7)
+- Verifies proper continuation byte sequences (0x80-0xBF)
+- Removes orphaned lead bytes while preserving valid multi-byte sequences
+- Keeps ASCII and properly-formed UTF-8 intact
+
+## Related Commits
+
+- `a436b95ed`: Apply UTF-8 repair in RuntimeCode.evalStringHelper - JVM path
+- `d7f725e27`: Apply UTF-8 repair in EvalStringHandler - Interpreter path
+- `d50c2387b`: Document revert of UTF-8 file encoding preference
+- `12222348b`: Revert non-standard UTF-8 preference (keep Perl 5 standard)
+- `7e487f2f5`: Extended UTF-8 lead byte repair (regex-only, earlier approach)
+
+
diff --git a/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java b/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java
@@ -144,8 +144,19 @@ private static void visitDeleteHashSlice(BytecodeCompiler bc, OperatorNode node,
                 bc.emitReg(hashReg);
                 bc.emit(nameIdx);
             }
+        } else if (leftOp.operand instanceof BlockNode block) {
+            // Handle expression-based delete: delete @{$expr}{@keys}
+            // Compile the block to get a hash reference
+            bc.compileNode(block, -1, RuntimeContextType.SCALAR);
+            int refReg = bc.lastResultReg;
+
+            // Dereference to get the actual hash
+            hashReg = bc.allocateRegister();
+            bc.emit(Opcodes.DEREF_HASH);
+            bc.emitReg(hashReg);
+            bc.emitReg(refReg);
         } else {
-            bc.throwCompilerException("Hash slice delete requires identifier");
+            bc.throwCompilerException("Hash slice delete requires identifier or expression");
             return;
         }
         if (!(hashAccess.right instanceof HashLiteralNode keysNode)) {
@@ -196,8 +207,19 @@ private static void visitDeleteHashKVSlice(BytecodeCompiler bc, OperatorNode nod
                 bc.emitReg(hashReg);
                 bc.emit(nameIdx);
             }
+        } else if (leftOp.operand instanceof BlockNode block) {
+            // Handle expression-based kv-slice delete: delete %{$expr}{@keys}
+            // Compile the block to get a hash reference
+            bc.compileNode(block, -1, RuntimeContextType.SCALAR);
+            int refReg = bc.lastResultReg;
+
+            // Dereference to get the actual hash
+            hashReg = bc.allocateRegister();
+            bc.emit(Opcodes.DEREF_HASH);
+            bc.emitReg(hashReg);
+            bc.emitReg(refReg);
         } else {
-            bc.throwCompilerException("Hash kv-slice delete requires identifier");
+            bc.throwCompilerException("Hash kv-slice delete requires identifier or expression");
             return;
         }
         if (!(hashAccess.right instanceof HashLiteralNode keysNode)) {
@@ -289,8 +311,19 @@ private static void visitDeleteArraySlice(BytecodeCompiler bc, OperatorNode node
                 bc.emitReg(arrayReg);
                 bc.emit(nameIdx);
             }
+        } else if (leftOp.operand instanceof BlockNode block) {
+            // Handle expression-based delete: delete @{$expr}[@indices]
+            // Compile the block to get an array reference
+            bc.compileNode(block, -1, RuntimeContextType.SCALAR);
+            int refReg = bc.lastResultReg;
+
+            // Dereference to get the actual array
+            arrayReg = bc.allocateRegister();
+            bc.emit(Opcodes.DEREF_ARRAY);
+            bc.emitReg(arrayReg);
+            bc.emitReg(refReg);
         } else {
-            bc.throwCompilerException("Array slice delete requires identifier");
+            bc.throwCompilerException("Array slice delete requires identifier or expression");
             return;
         }
         if (!(arrayAccess.right instanceof ArrayLiteralNode indicesNode)) {
@@ -332,8 +365,19 @@ private static void visitDeleteArrayKVSlice(BytecodeCompiler bc, OperatorNode no
                 bc.emitReg(arrayReg);
                 bc.emit(nameIdx);
             }
+        } else if (leftOp.operand instanceof BlockNode block) {
+            // Handle expression-based kv-slice delete: delete %{$expr}[@indices]
+            // Compile the block to get an array reference
+            bc.compileNode(block, -1, RuntimeContextType.SCALAR);
+            int refReg = bc.lastResultReg;
+
+            // Dereference to get the actual array
+            arrayReg = bc.allocateRegister();
+            bc.emit(Opcodes.DEREF_ARRAY);
+            bc.emitReg(arrayReg);
+            bc.emitReg(refReg);
         } else {
-            bc.throwCompilerException("Array kv-slice delete requires identifier");
+            bc.throwCompilerException("Array kv-slice delete requires identifier or expression");
             return;
         }
         if (!(arrayAccess.right instanceof ArrayLiteralNode indicesNode)) {

diff --git a/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java b/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java
@@ -10,6 +10,7 @@
 import org.perlonjava.frontend.parser.Parser;
 import org.perlonjava.frontend.semantic.ScopedSymbolTable;
 import org.perlonjava.runtime.operators.WarnDie;
+import org.perlonjava.runtime.regex.RuntimeRegex;
 import org.perlonjava.runtime.runtimetypes.*;
 
 import java.util.ArrayList;
@@ -110,6 +111,13 @@ public static RuntimeList evalStringList(String perlCode,
             // Step 1: Clear $@ at start of eval
             GlobalVariable.getGlobalVariable("main::@").set("");
 
+            // Step 1b: Repair orphaned UTF-8 lead bytes in eval'd code
+            // This handles cases where Sub::HandlesVia and other Perl modules generate code
+            // with Latin-1 misencoded multi-byte UTF-8 sequences
+            if (perlCode != null && !perlCode.isEmpty()) {
+                perlCode = RuntimeRegex.repairLatin1EncodedUtf8IfCorrupted(perlCode);
+            }
+
             // Step 2: Parse the string to AST
             Lexer lexer = new Lexer(perlCode);
             List<LexerToken> tokens = lexer.tokenize();
@@ -124,6 +132,21 @@ public static RuntimeList evalStringList(String perlCode,
 
             CompilerOptions opts = new CompilerOptions();
             opts.fileName = evalFileName;
+
+            // Detect if eval string contains UTF-8 characters and set isUnicodeSource
+            // This allows identifiers with UTF-8 characters (e.g., guillemets « » from CodeGenerator)
+            // to be properly recognized instead of being rejected as "Unrecognized character"
+            if (perlCode != null && !perlCode.isEmpty()) {
+                for (int i = 0; i < perlCode.length(); i++) {
+                    if (perlCode.charAt(i) > 127) {
+                        opts.isUnicodeSource = true;
+                        evalTrace("Detected UTF-8 in eval at char " + i +
+                            ", first non-ASCII char: U+" + Integer.toHexString(perlCode.charAt(i)).toUpperCase());
+                        break;
+                    }
+                }
+            }
+
             ScopedSymbolTable symbolTable = new ScopedSymbolTable();
 
             // Add standard variables that are always available in eval context.
@@ -395,6 +418,21 @@ public static RuntimeScalar evalString(String perlCode,
 
             CompilerOptions opts = new CompilerOptions();
             opts.fileName = evalFileName;
+
+            // Detect if eval string contains UTF-8 characters and set isUnicodeSource
+            // This allows identifiers with UTF-8 characters (e.g., guillemets « » from CodeGenerator)
+            // to be properly recognized instead of being rejected as "Unrecognized character"
+            if (perlCode != null && !perlCode.isEmpty()) {
+                for (int i = 0; i < perlCode.length(); i++) {
+                    if (perlCode.charAt(i) > 127) {
+                        opts.isUnicodeSource = true;
+                        evalTrace("Detected UTF-8 in eval at char " + i +
+                            ", first non-ASCII char: U+" + Integer.toHexString(perlCode.charAt(i)).toUpperCase());
+                        break;
+                    }
+                }
+            }
+
             ScopedSymbolTable symbolTable = new ScopedSymbolTable();
 
             // Add standard variables that are always available in eval context.

diff --git a/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java b/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java
@@ -169,19 +169,31 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
         // In `no utf8` mode (or `evalbytes`), Perl still allows many non-ASCII bytes as length-1 variables,
         // but it must reject whitespace-like bytes and format/control bytes. Additionally, for length-2+
         // identifiers, non-ASCII bytes are not allowed.
-        boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
+        // Note: isUnicodeSource is set when eval'd code contains UTF-8 characters, even without `use utf8`
+        boolean utf8Enabled = (parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
+                || parser.ctx.compilerOptions.isUnicodeSource)
                 && !parser.ctx.compilerOptions.isEvalbytes;
 
-        if (!utf8Enabled && token.type == LexerTokenType.IDENTIFIER) {
+        if (token.type == LexerTokenType.IDENTIFIER) {
             // The Lexer may have greedily consumed non-ASCII identifier parts into a single IDENTIFIER token.
-            // Under `no utf8` / `evalbytes`, those are not allowed for length-2+ variables.
+            // Under `no utf8` / `evalbytes`, non-ASCII letters/digits are not allowed for length-2+ variables.
+            // Under normal operation (utf8 enabled), non-ASCII letters/digits ARE allowed but control chars are not.
             String id = token.text;
             if (id.length() > 1) {
                 for (int i = 0; i < id.length(); ) {
                     int cp = id.codePointAt(i);
                     if (cp > 127) {
-                        String hex = "\\x{" + Integer.toHexString(cp) + "}";
-                        throw new PerlCompilerException("Unrecognized character " + hex + ";");
+                        // In no-utf8 mode, reject all non-ASCII characters
+                        if (!utf8Enabled) {
+                            String hex = "\\x{" + Integer.toHexString(cp) + "}";
+                            throw new PerlCompilerException("Unrecognized character " + hex + ";");
+                        }
+                        // In utf8 mode, allow UTF-8 identifier characters (use UCharacter for proper Unicode support)
+                        // UCharacter.hasBinaryProperty with XID_CONTINUE is the proper test for identifier parts
+                        if (!(cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_CONTINUE))) {
+                            String hex = "\\x{" + Integer.toHexString(cp) + "}";
+                            throw new PerlCompilerException("Unrecognized character " + hex + ";");
+                        }
                     }
                     i += Character.charCount(cp);
                 }

diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java b/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java
@@ -155,7 +155,7 @@ public static RuntimeList can(RuntimeArray args, int ctx) {
             if (method != null && !isAutoloadDispatch(method, actualMethod, perlClassName)) {
                 return method.getList();
             }
-            return new RuntimeList();
+            return scalarUndef.getList();
         }
 
         // Handle Package::SUPER::method syntax
@@ -168,7 +168,7 @@ public static RuntimeList can(RuntimeArray args, int ctx) {
             if (method != null && !isAutoloadDispatch(method, actualMethod, packageName)) {
                 return method.getList();
             }
-            return new RuntimeList();
+            return scalarUndef.getList();
         }
 
         // Perl's can() must NOT consider AUTOLOAD - it should only find
@@ -219,7 +219,8 @@ public static RuntimeList can(RuntimeArray args, int ctx) {
                 return method.getList();
             }
         }
-        return new RuntimeList();
+        // Return undef (not empty list) when method not found
+        return scalarUndef.getList();
     }
 
     /**