diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 000000000..69f9be7b6 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,41 @@ +{ + "permissions": { + "allow": [ + "Bash(grep -n *)", + "Bash(grep -r *)", + "Bash(ls -la *)", + "Bash(grep -rn *)", + "Bash(find . *)", + "Bash(git log *)", + "Bash(git status *)", + "Bash(grep -A *)", + "Bash(ls -lh *)", + "Bash(git diff *)", + "Bash(find /Users/fglock/projects/PerlOnJava4 *)", + "Bash(find /Users/fglock/projects/PerlOnJava4/src/main/java *)", + "Bash(git branch *)", + "Bash(find ~/.claude/projects/ *)", + "Bash(wc -l *)", + "Bash(grep -B5 *)", + "Bash(grep -A5 *)", + "Bash(git show *)", + "Bash(ps aux *)", + "Bash(find /Users/fglock/projects/PerlOnJava4/src/main/perl/lib *)", + "Bash(grep -B *)", + "Bash(find ~/.perlonjava *)", + "Bash(find ~/.cpan *)", + "Bash(find /Users/fglock/projects/PerlOnJava4/src *)", + "Bash(find /Users/fglock/projects/PerlOnJava4/examples *)", + "Bash(ls -la ~/.cpan *)", + "Bash(ls -la ~/.perlonjava *)", + "Bash(cat ~/.cpan *)", + "Bash(cat ~/.perlonjava *)", + "Bash(./jcpan -t *)", + "Bash(timeout *)", + "Bash(until grep *)", + "Bash(ls -la /tmp *)", + "Bash(cat /tmp *)", + "Read(~/.perlonjava/*)" + ] + } +} diff --git a/dev/modules/sub_handlesviacontinuation.md b/dev/modules/sub_handlesviacontinuation.md new file mode 100644 index 000000000..9c33b7f5c --- /dev/null +++ b/dev/modules/sub_handlesviacontinuation.md @@ -0,0 +1,62 @@ +# Sub::HandlesVia UTF-8 Fix + +**Status**: Solution implemented - Eval-time UTF-8 repair applied to both interpreter and JVM compilation paths + +## Problem + +When Sub::HandlesVia generates accessor delegation code, orphaned UTF-8 lead bytes (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7) appear in generated Perl code, causing syntax errors: + +``` +Global symbol "@shv_tmp\x{c2}" requires explicit package name +Unrecognized character \x{c2}; at /Users/fglock/.perlonjava/lib/Eval/TypeTiny.pm line 8 +syntax error at set_option=Hash:set line 
5, near "\"Wrong number " +``` + +## Root Cause Analysis + +The corruption stems from a UTF-8/Latin-1 encoding mismatch: + +1. UTF-8 files may be decoded as Latin-1, which leaves the bytes of multi-byte sequences behind as orphaned high bytes +2. When Perl code (such as Sub::HandlesVia::CodeGenerator) generates methods by string concatenation, any source string whose multi-byte UTF-8 sequences were interpreted as Latin-1 carries that corruption into the generated code +3. The orphaned bytes end up in generated Perl code that is eval'd at runtime, breaking parsing + +## Solution (branch: `fix/sub-handlesvia-utf8`) + +### Implementation - Eval-time Corruption Repair + +Apply UTF-8 corruption repair in BOTH eval paths: +1. **EvalStringHandler** (interpreter path) - commit d7f725e27 +2. **RuntimeCode.evalStringHelper** (JVM compilation path) - commit a436b95ed + +**Why eval-time repair?** +- Sub::HandlesVia generates code via Perl string concatenation, not regex substitutions +- Previous regex-only repair missed this code generation path +- By repairing ALL eval'd code before the Lexer processes it, we catch corruption from all sources + +**Changes**: +- Made `RuntimeRegex.repairLatin1EncodedUtf8IfCorrupted()` public and static +- Added repair call in EvalStringHandler before parsing (Step 1b) +- Added repair call in RuntimeCode.evalStringHelper before Lexer/Parser +- Orphaned lead bytes are removed, allowing valid parsing + +## Test Results + +Testing is in progress with a fresh build of PerlOnJava that has the UTF-8 repair applied to both eval paths. 
+ +## Technical Details + +The corruption repair function: +- Scans for orphaned UTF-8 lead bytes (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7) +- Verifies proper continuation byte sequences (0x80-0xBF) +- Removes orphaned lead bytes while preserving valid multi-byte sequences +- Keeps ASCII and properly-formed UTF-8 intact + +## Related Commits + +- `a436b95ed`: Apply UTF-8 repair in RuntimeCode.evalStringHelper - JVM path +- `d7f725e27`: Apply UTF-8 repair in EvalStringHandler - Interpreter path +- `d50c2387b`: Document revert of UTF-8 file encoding preference +- `12222348b`: Revert non-standard UTF-8 preference (keep Perl 5 standard) +- `7e487f2f5`: Extended UTF-8 lead byte repair (regex-only, earlier approach) + + diff --git a/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java b/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java index 3de101524..e04368726 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java +++ b/src/main/java/org/perlonjava/backend/bytecode/CompileExistsDelete.java @@ -144,8 +144,19 @@ private static void visitDeleteHashSlice(BytecodeCompiler bc, OperatorNode node, bc.emitReg(hashReg); bc.emit(nameIdx); } + } else if (leftOp.operand instanceof BlockNode block) { + // Handle expression-based delete: delete @{$expr}{@keys} + // Compile the block to get a hash reference + bc.compileNode(block, -1, RuntimeContextType.SCALAR); + int refReg = bc.lastResultReg; + + // Dereference to get the actual hash + hashReg = bc.allocateRegister(); + bc.emit(Opcodes.DEREF_HASH); + bc.emitReg(hashReg); + bc.emitReg(refReg); } else { - bc.throwCompilerException("Hash slice delete requires identifier"); + bc.throwCompilerException("Hash slice delete requires identifier or expression"); return; } if (!(hashAccess.right instanceof HashLiteralNode keysNode)) { @@ -196,8 +207,19 @@ private static void visitDeleteHashKVSlice(BytecodeCompiler bc, OperatorNode nod bc.emitReg(hashReg); bc.emit(nameIdx); } + } else 
if (leftOp.operand instanceof BlockNode block) { + // Handle expression-based delete: delete @{$expr}{@keys} + // Compile the block to get a hash reference + bc.compileNode(block, -1, RuntimeContextType.SCALAR); + int refReg = bc.lastResultReg; + + // Dereference to get the actual hash + hashReg = bc.allocateRegister(); + bc.emit(Opcodes.DEREF_HASH); + bc.emitReg(hashReg); + bc.emitReg(refReg); } else { - bc.throwCompilerException("Hash kv-slice delete requires identifier"); + bc.throwCompilerException("Hash kv-slice delete requires identifier or expression"); return; } if (!(hashAccess.right instanceof HashLiteralNode keysNode)) { @@ -289,8 +311,19 @@ private static void visitDeleteArraySlice(BytecodeCompiler bc, OperatorNode node bc.emitReg(arrayReg); bc.emit(nameIdx); } + } else if (leftOp.operand instanceof BlockNode block) { + // Handle expression-based delete: delete @{$expr}[@indices] + // Compile the block to get an array reference + bc.compileNode(block, -1, RuntimeContextType.SCALAR); + int refReg = bc.lastResultReg; + + // Dereference to get the actual array + arrayReg = bc.allocateRegister(); + bc.emit(Opcodes.DEREF_ARRAY); + bc.emitReg(arrayReg); + bc.emitReg(refReg); } else { - bc.throwCompilerException("Array slice delete requires identifier"); + bc.throwCompilerException("Array slice delete requires identifier or expression"); return; } if (!(arrayAccess.right instanceof ArrayLiteralNode indicesNode)) { @@ -332,8 +365,19 @@ private static void visitDeleteArrayKVSlice(BytecodeCompiler bc, OperatorNode no bc.emitReg(arrayReg); bc.emit(nameIdx); } + } else if (leftOp.operand instanceof BlockNode block) { + // Handle expression-based delete: delete @{$expr}[@indices] + // Compile the block to get an array reference + bc.compileNode(block, -1, RuntimeContextType.SCALAR); + int refReg = bc.lastResultReg; + + // Dereference to get the actual array + arrayReg = bc.allocateRegister(); + bc.emit(Opcodes.DEREF_ARRAY); + bc.emitReg(arrayReg); + 
bc.emitReg(refReg); } else { - bc.throwCompilerException("Array kv-slice delete requires identifier"); + bc.throwCompilerException("Array kv-slice delete requires identifier or expression"); return; } if (!(arrayAccess.right instanceof ArrayLiteralNode indicesNode)) { diff --git a/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java b/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java index 33339c246..b15f18c48 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/EvalStringHandler.java @@ -10,6 +10,7 @@ import org.perlonjava.frontend.parser.Parser; import org.perlonjava.frontend.semantic.ScopedSymbolTable; import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.regex.RuntimeRegex; import org.perlonjava.runtime.runtimetypes.*; import java.util.ArrayList; @@ -110,6 +111,13 @@ public static RuntimeList evalStringList(String perlCode, // Step 1: Clear $@ at start of eval GlobalVariable.getGlobalVariable("main::@").set(""); + // Step 1b: Repair orphaned UTF-8 lead bytes in eval'd code + // This handles cases where Sub::HandlesVia and other Perl modules generate code + // with Latin-1 misencoded multi-byte UTF-8 sequences + if (perlCode != null && !perlCode.isEmpty()) { + perlCode = RuntimeRegex.repairLatin1EncodedUtf8IfCorrupted(perlCode); + } + // Step 2: Parse the string to AST Lexer lexer = new Lexer(perlCode); List tokens = lexer.tokenize(); @@ -124,6 +132,21 @@ public static RuntimeList evalStringList(String perlCode, CompilerOptions opts = new CompilerOptions(); opts.fileName = evalFileName; + + // Detect if eval string contains UTF-8 characters and set isUnicodeSource + // This allows identifiers with UTF-8 characters (e.g., guillemets « » from CodeGenerator) + // to be properly recognized instead of being rejected as "Unrecognized character" + if (perlCode != null && !perlCode.isEmpty()) { + for (int i = 0; i < 
perlCode.length(); i++) { + if (perlCode.charAt(i) > 127) { + opts.isUnicodeSource = true; + evalTrace("Detected UTF-8 in eval at char " + i + + ", first non-ASCII char: U+" + Integer.toHexString(perlCode.charAt(i)).toUpperCase()); + break; + } + } + } + ScopedSymbolTable symbolTable = new ScopedSymbolTable(); // Add standard variables that are always available in eval context. @@ -395,6 +418,21 @@ public static RuntimeScalar evalString(String perlCode, CompilerOptions opts = new CompilerOptions(); opts.fileName = evalFileName; + + // Detect if eval string contains UTF-8 characters and set isUnicodeSource + // This allows identifiers with UTF-8 characters (e.g., guillemets « » from CodeGenerator) + // to be properly recognized instead of being rejected as "Unrecognized character" + if (perlCode != null && !perlCode.isEmpty()) { + for (int i = 0; i < perlCode.length(); i++) { + if (perlCode.charAt(i) > 127) { + opts.isUnicodeSource = true; + evalTrace("Detected UTF-8 in eval at char " + i + + ", first non-ASCII char: U+" + Integer.toHexString(perlCode.charAt(i)).toUpperCase()); + break; + } + } + } + ScopedSymbolTable symbolTable = new ScopedSymbolTable(); // Add standard variables that are always available in eval context. diff --git a/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java b/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java index 1b3ee92c9..6f2e0ad04 100644 --- a/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/IdentifierParser.java @@ -169,19 +169,31 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // In `no utf8` mode (or `evalbytes`), Perl still allows many non-ASCII bytes as length-1 variables, // but it must reject whitespace-like bytes and format/control bytes. Additionally, for length-2+ // identifiers, non-ASCII bytes are not allowed. 
- boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) + // Note: isUnicodeSource is set when eval'd code contains UTF-8 characters, even without `use utf8` + boolean utf8Enabled = (parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) + || parser.ctx.compilerOptions.isUnicodeSource) && !parser.ctx.compilerOptions.isEvalbytes; - if (!utf8Enabled && token.type == LexerTokenType.IDENTIFIER) { + if (token.type == LexerTokenType.IDENTIFIER) { // The Lexer may have greedily consumed non-ASCII identifier parts into a single IDENTIFIER token. - // Under `no utf8` / `evalbytes`, those are not allowed for length-2+ variables. + // Under `no utf8` / `evalbytes`, non-ASCII letters/digits are not allowed for length-2+ variables. + // Under normal operation (utf8 enabled), non-ASCII letters/digits ARE allowed but control chars are not. String id = token.text; if (id.length() > 1) { for (int i = 0; i < id.length(); ) { int cp = id.codePointAt(i); if (cp > 127) { - String hex = "\\x{" + Integer.toHexString(cp) + "}"; - throw new PerlCompilerException("Unrecognized character " + hex + ";"); + // In no-utf8 mode, reject all non-ASCII characters + if (!utf8Enabled) { + String hex = "\\x{" + Integer.toHexString(cp) + "}"; + throw new PerlCompilerException("Unrecognized character " + hex + ";"); + } + // In utf8 mode, allow UTF-8 identifier characters (use UCharacter for proper Unicode support) + // UCharacter.hasBinaryProperty with XID_CONTINUE is the proper test for identifier parts + if (!(cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_CONTINUE))) { + String hex = "\\x{" + Integer.toHexString(cp) + "}"; + throw new PerlCompilerException("Unrecognized character " + hex + ";"); + } } i += Character.charCount(cp); } diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java b/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java index f1b10205e..38fa9a6de 100644 --- 
a/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Universal.java @@ -155,7 +155,7 @@ public static RuntimeList can(RuntimeArray args, int ctx) { if (method != null && !isAutoloadDispatch(method, actualMethod, perlClassName)) { return method.getList(); } - return new RuntimeList(); + return scalarUndef.getList(); } // Handle Package::SUPER::method syntax @@ -168,7 +168,7 @@ public static RuntimeList can(RuntimeArray args, int ctx) { if (method != null && !isAutoloadDispatch(method, actualMethod, packageName)) { return method.getList(); } - return new RuntimeList(); + return scalarUndef.getList(); } // Perl's can() must NOT consider AUTOLOAD - it should only find @@ -219,7 +219,8 @@ public static RuntimeList can(RuntimeArray args, int ctx) { return method.getList(); } } - return new RuntimeList(); + // Return undef (not empty list) when method not found + return scalarUndef.getList(); } /** diff --git a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java index 8cdd0fd00..b12941634 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java @@ -1306,6 +1306,13 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar matcher.appendTail(resultBuffer); } + String finalResult = resultBuffer.toString(); + + // Repair potential UTF-8 corruption from Matcher.appendTail() + // ONLY if the result looks corrupted (has orphaned lead bytes near punctuation), + // not if it looks like legitimate code + finalResult = repairLatin1EncodedUtf8IfCorrupted(finalResult); + // Release captures from the replacement closure to unblock DESTROY. // The s///eg replacement is compiled as an anonymous sub that captures // lexical variables from the enclosing scope (incrementing their captureCount). 
@@ -1318,7 +1325,6 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar } if (found > 0) { - String finalResult = resultBuffer.toString(); boolean wasByteString = (string.type == RuntimeScalarType.BYTE_STRING); // Store as last successful pattern for empty pattern reuse @@ -1371,6 +1377,112 @@ public static void reset() { } } + /** + * Repair orphaned UTF-8 lead bytes that are clearly corruption. + * + * Removes orphaned lead bytes (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF7 without their + * required continuation bytes 0x80-0xBF) from the string if any are detected. + * Orphaned lead bytes are corruption artifacts from Latin-1 misencoding. + */ + public static String repairLatin1EncodedUtf8IfCorrupted(String str) { + if (str == null || str.isEmpty()) { + return str; + } + + // Scan for any orphaned lead bytes + // 0xC0-0xDF = start of 2-byte sequence (needs 1 continuation) + // 0xE0-0xEF = start of 3-byte sequence (needs 2 continuations) + // 0xF0-0xF7 = start of 4-byte sequence (needs 3 continuations) + boolean hasOrphanedBytes = false; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + int requiredContinuations = 0; + + if (c >= 0xC0 && c <= 0xDF) { + requiredContinuations = 1; + } else if (c >= 0xE0 && c <= 0xEF) { + requiredContinuations = 2; + } else if (c >= 0xF0 && c <= 0xF7) { + requiredContinuations = 3; + } + + if (requiredContinuations > 0) { + // Check if we have enough continuation bytes + for (int j = 0; j < requiredContinuations; j++) { + if (i + j + 1 >= str.length()) { + // Not enough bytes left + hasOrphanedBytes = true; + break; + } + char next = str.charAt(i + j + 1); + if (!(next >= 0x80 && next <= 0xBF)) { + // Not a continuation byte + hasOrphanedBytes = true; + break; + } + } + if (hasOrphanedBytes) { + break; + } + // Skip the continuation bytes + i += requiredContinuations; + } + } + + if (!hasOrphanedBytes) { + return str; + } + + // Remove all orphaned lead bytes and their incomplete sequences + 
StringBuilder repaired = new StringBuilder(); + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + int requiredContinuations = 0; + + if (c >= 0xC0 && c <= 0xDF) { + requiredContinuations = 1; + } else if (c >= 0xE0 && c <= 0xEF) { + requiredContinuations = 2; + } else if (c >= 0xF0 && c <= 0xF7) { + requiredContinuations = 3; + } + + if (requiredContinuations > 0) { + // Check if we have a complete UTF-8 sequence + boolean isCompleteSequence = true; + for (int j = 0; j < requiredContinuations; j++) { + if (i + j + 1 >= str.length()) { + isCompleteSequence = false; + break; + } + char next = str.charAt(i + j + 1); + if (!(next >= 0x80 && next <= 0xBF)) { + isCompleteSequence = false; + break; + } + } + + if (isCompleteSequence) { + // Valid sequence, keep all bytes + repaired.append(c); + for (int j = 0; j < requiredContinuations; j++) { + repaired.append(str.charAt(i + j + 1)); + } + i += requiredContinuations; + } else { + // Orphaned or incomplete sequence, skip the lead byte + // Continue to next character (don't skip continuation bytes) + } + } else if (!(c >= 0x80 && c <= 0xBF)) { + // Regular character (not lead byte, not continuation byte) + repaired.append(c); + } + // Skip any orphaned continuation bytes + } + + return repaired.toString(); + } + /** * Initialize/reset all regex state including special variables. * This should be called at the start of each script execution to ensure clean state. diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/FileUtils.java b/src/main/java/org/perlonjava/runtime/runtimetypes/FileUtils.java index 601716945..189201134 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/FileUtils.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/FileUtils.java @@ -145,8 +145,6 @@ private static Charset detectCharsetWithoutBOM(byte[] bytes) { // Check if file contains non-ASCII bytes that aren't valid UTF-8. // Perl 5 without 'use utf8' treats source as Latin-1 (ISO-8859-1). 
- // We use UTF-8 for valid UTF-8 files (most modern files), but fall back - // to ISO-8859-1 for files with invalid UTF-8 sequences (legacy Latin-1 files). if (hasNonAscii(bytes) && !isValidUtf8(bytes)) { return StandardCharsets.ISO_8859_1; } diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java index 285305490..9b412a12a 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimeCode.java @@ -1128,12 +1128,24 @@ public static Class evalStringHelper(RuntimeScalar code, String evalTag, Obje // Check if the eval string contains non-ASCII characters // If so, treat it as Unicode source to preserve Unicode characters during parsing // EXCEPT for evalbytes, which must treat everything as bytes + // NOTE: Even BYTE_STRINGs can contain UTF-8 encoded sequences, so we check all types String evalString = code.toString(); + + // Repair orphaned UTF-8 lead bytes in eval'd code (same as EvalStringHandler) + if (!ctx.isEvalbytes && evalString != null && !evalString.isEmpty()) { + evalString = org.perlonjava.runtime.regex.RuntimeRegex.repairLatin1EncodedUtf8IfCorrupted(evalString); + } + boolean hasUnicode = false; - if (!ctx.isEvalbytes && code.type != RuntimeScalarType.BYTE_STRING) { + if (!ctx.isEvalbytes) { for (int i = 0; i < evalString.length(); i++) { if (evalString.charAt(i) > 127) { hasUnicode = true; + if (EVAL_TRACE) { + System.err.println("[RuntimeCode.evalStringHelper] Detected non-ASCII char at position " + i + + ": U+" + Integer.toHexString(evalString.charAt(i)).toUpperCase() + + " in string type=" + code.type); + } break; } } @@ -1149,6 +1161,9 @@ public static Class evalStringHelper(RuntimeScalar code, String evalTag, Obje boolean isByteStringSource = !ctx.isEvalbytes && code.type == RuntimeScalarType.BYTE_STRING; if (hasUnicode) { evalCompilerOptions.isUnicodeSource = true; + if 
(EVAL_TRACE) { + System.err.println("[RuntimeCode.evalStringHelper] Setting isUnicodeSource=true"); + } } if (ctx.isEvalbytes) { evalCompilerOptions.isEvalbytes = true;