Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,8 +387,7 @@ def test_utf8_non_utf8_third_line_error(self):
b'#third\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"'utf-8' codec can't decode byte|"
br"encoding problem: utf8")
br"'utf-8' codec can't decode byte")

def test_crlf(self):
src = (b'print(ascii("""\r\n"""))\n')
Expand Down Expand Up @@ -540,6 +539,15 @@ def check_script_error(self, src, expected, lineno=...):
line = line.removeprefix('\ufeff')
self.assertIn(line.encode(), err)

def test_coding_spec_unknown_encoding(self):
src = (b'# coding: dict-unpacking-at-home\n'

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use such a bizarre name?

@johnslavik johnslavik Jun 14, 2026

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this is a reference to https://github.com/asottile-archive/dict-unpacking-at-home 😅 One can declare it as the encoding, and if this package is present, it rewrites the AST of the file to support a "dict unpacking" syntax.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this might come off as eccentric, I'll make a more realistic test, a typo.

b'{foo} = {"foo": "bar"}\n')

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not even valid syntax.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While we are here, it would be nice to also add a test for a non-text encoding, like "hex_codec" or "rot13" (or both).

self.check_script_error(src, br"unknown encoding: dict-unpacking-at-home")

def test_coding_spec_decode_error(self):
    """A source that its declared codec cannot decode reports the codec error."""
    # The coding cookie selects shift-jis; the string literal on the next
    # line carries bytes the test expects that codec to reject.
    declaration = b'# coding: shift-jis\n'
    body = b'print("\xc4\x85")\n'
    expected = br"'shift_jis' codec can't decode byte"
    self.check_script_error(declaration + body, expected)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix direct execution of files with invalid source encodings to report the
underlying codec lookup or decoding error instead of the generic
``SyntaxError: encoding problem`` message. Patch by Bartosz Sławecki.
5 changes: 3 additions & 2 deletions Parser/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include "lexer/lexer.h"
#include "tokenizer/tokenizer.h"
#include "tokenizer/helpers.h"
#include "pegen.h"

// Internal parser functions
Expand Down Expand Up @@ -993,7 +994,7 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena
struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
if (tok == NULL) {
if (PyErr_Occurred()) {
_PyPegen_raise_tokenizer_init_error(filename_ob);
_PyTokenizer_raise_init_error(filename_ob);
return NULL;
}
return NULL;
Expand Down Expand Up @@ -1051,7 +1052,7 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
}
if (tok == NULL) {
if (PyErr_Occurred()) {
_PyPegen_raise_tokenizer_init_error(filename_ob);
_PyTokenizer_raise_init_error(filename_ob);
}
return NULL;
}
Expand Down
1 change: 0 additions & 1 deletion Parser/pegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,6 @@ typedef enum {
} TARGETS_TYPE;

int _Pypegen_raise_decode_error(Parser *p);
void _PyPegen_raise_tokenizer_init_error(PyObject *filename);
int _Pypegen_tokenizer_error(Parser *p);
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...);
void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
Expand Down
47 changes: 0 additions & 47 deletions Parser/pegen_errors.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,6 @@

// TOKENIZER ERRORS

/* Turn a pending tokenizer-initialisation failure into a SyntaxError.
 *
 * Only acts when the current exception is a LookupError, SyntaxError,
 * ValueError or UnicodeDecodeError; any other pending exception is left
 * exactly as it is.
 *
 *  - A pending SyntaxError is kept and only gets its ``filename`` attribute
 *    set to `filename`.
 *  - Any other matching exception is replaced by a new SyntaxError whose
 *    message is str() of the original and whose location tuple is
 *    (filename, 0, -1, None).
 *
 * If one of the intermediate steps fails, the original exception is dropped
 * and whatever error that step raised is left pending instead.
 */
void
_PyPegen_raise_tokenizer_init_error(PyObject *filename)
{
    int convertible = PyErr_ExceptionMatches(PyExc_LookupError)
                      || PyErr_ExceptionMatches(PyExc_SyntaxError)
                      || PyErr_ExceptionMatches(PyExc_ValueError)
                      || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError);
    if (!convertible) {
        return;
    }

    PyObject *exc_type;
    PyObject *exc_value;
    PyObject *exc_tb;
    PyObject *message = NULL;
    PyObject *location = NULL;
    PyObject *args = NULL;

    /* Take ownership of the pending exception; the indicator is now clear. */
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);

    if (PyErr_GivenExceptionMatches(exc_value, PyExc_SyntaxError)) {
        if (PyObject_SetAttr(exc_value, &_Py_ID(filename), filename) == 0) {
            /* PyErr_Restore() takes over all three references. */
            PyErr_Restore(exc_type, exc_value, exc_tb);
            return;
        }
        goto done;
    }

    message = PyObject_Str(exc_value);
    if (message == NULL) {
        goto done;
    }

    location = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (location == NULL) {
        goto done;
    }

    args = _PyTuple_FromPair(message, location);
    if (args != NULL) {
        PyErr_SetObject(PyExc_SyntaxError, args);
    }

done:
    /* Reached on success as well: every reference still held is released. */
    Py_XDECREF(exc_type);
    Py_XDECREF(exc_value);
    Py_XDECREF(exc_tb);
    Py_XDECREF(message);
    Py_XDECREF(location);
    Py_XDECREF(args);
}

static inline void
raise_unclosed_parentheses_error(Parser *p) {
int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
Expand Down
51 changes: 50 additions & 1 deletion Parser/tokenizer/helpers.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "Python.h"
#include "errcode.h"
#include "pycore_runtime.h" // _Py_ID()
#include "pycore_token.h"
#include "pycore_tuple.h" // _PyTuple_FromPair

#include "../lexer/state.h"

Expand Down Expand Up @@ -149,6 +151,53 @@ _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_inval
return 0;
}

/* Turn a pending tokenizer-initialisation failure into a SyntaxError.
 *
 * Only acts when the current exception is a LookupError, SyntaxError,
 * ValueError or UnicodeDecodeError; any other pending exception is left
 * exactly as it is.
 *
 *  - A pending SyntaxError is kept and only gets its ``filename`` attribute
 *    set to `filename`.
 *  - Any other matching exception is replaced by a new SyntaxError whose
 *    message is str() of the original and whose location tuple is
 *    (filename, 0, -1, None).
 *
 * If one of the intermediate steps fails, the original exception is dropped
 * and whatever error that step raised is left pending instead.
 *
 * Uses PyErr_GetRaisedException()/PyErr_SetRaisedException() rather than the
 * PyErr_Fetch()/PyErr_Restore() triple, which is deprecated since Python 3.12.
 */
void
_PyTokenizer_raise_init_error(PyObject *filename)
{
    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
          || PyErr_ExceptionMatches(PyExc_SyntaxError)
          || PyErr_ExceptionMatches(PyExc_ValueError)
          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
        return;
    }

    PyObject *errstr = NULL;
    PyObject *location = NULL;
    PyObject *args = NULL;

    /* Take ownership of the pending exception; the indicator is now clear. */
    PyObject *exc = PyErr_GetRaisedException();

    if (PyErr_GivenExceptionMatches(exc, PyExc_SyntaxError)) {
        if (PyObject_SetAttr(exc, &_Py_ID(filename), filename) < 0) {
            goto done;
        }
        /* PyErr_SetRaisedException() steals the reference to exc. */
        PyErr_SetRaisedException(exc);
        return;
    }

    errstr = PyObject_Str(exc);
    if (errstr == NULL) {
        goto done;
    }

    location = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
    if (location == NULL) {
        goto done;
    }

    args = _PyTuple_FromPair(errstr, location);
    if (args == NULL) {
        goto done;
    }
    PyErr_SetObject(PyExc_SyntaxError, args);

done:
    /* Reached on success as well: every reference still held is released. */
    Py_XDECREF(exc);
    Py_XDECREF(errstr);
    Py_XDECREF(location);
    Py_XDECREF(args);
}

int
_PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
Expand Down Expand Up @@ -418,8 +467,8 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
if (tok->encoding == NULL) {
assert(tok->decoding_readline == NULL);
if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
_PyTokenizer_raise_init_error(tok->filename);
_PyTokenizer_error_ret(tok);
PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
PyMem_Free(cs);
return 0;
}
Expand Down
1 change: 1 addition & 0 deletions Parser/tokenizer/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ int _PyTokenizer_indenterror(struct tok_state *tok);
int _PyTokenizer_warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char);
int _PyTokenizer_parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...);
char *_PyTokenizer_error_ret(struct tok_state *tok);
void _PyTokenizer_raise_init_error(PyObject *filename);

char *_PyTokenizer_new_string(const char *s, Py_ssize_t len, struct tok_state *tok);
char *_PyTokenizer_translate_newlines(const char *s, int exec_input, int preserve_crlf, struct tok_state *tok);
Expand Down
Loading