Skip to content

Commit 711008a

Browse files
lukaszsamson authored and
josevalim committed
Consistently raise UnicodeConversionError in tokenizer (#14589)
1 parent 7a28203 commit 711008a

File tree

2 files changed

+51
-6
lines changed

2 files changed

+51
-6
lines changed

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,18 +1032,42 @@ unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}
10321032
try
10331033
{ok, binary_to_existing_atom(Binary, utf8)}
10341034
catch
1035-
error:badarg -> {error, {?LOC(Line, Column), "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}}
1035+
error:badarg ->
1036+
% Check if it's a UTF-8 issue by trying to convert to list
1037+
elixir_utils:characters_to_list(Binary),
1038+
% If we get here, it's not a UTF-8 issue
1039+
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}}
1040+
end;
1041+
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{}) when is_binary(Binary) ->
1042+
try
1043+
{ok, binary_to_atom(Binary, utf8)}
1044+
catch
1045+
error:badarg ->
1046+
% Try to convert using elixir_utils to get proper UnicodeConversionError
1047+
elixir_utils:characters_to_list(Binary),
1048+
% If we get here, it's not a UTF-8 issue, so it's some other badarg
1049+
{error, {?LOC(Line, Column), "invalid atom: ", elixir_utils:characters_to_list(Binary)}}
10361050
end;
1037-
unsafe_to_atom(Binary, _Line, _Column, #elixir_tokenizer{}) when is_binary(Binary) ->
1038-
{ok, binary_to_atom(Binary, utf8)};
10391051
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) ->
10401052
try
10411053
{ok, list_to_existing_atom(List)}
10421054
catch
1043-
error:badarg -> {error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
1055+
error:badarg ->
1056+
% Try to convert using elixir_utils to get proper UnicodeConversionError
1057+
elixir_utils:characters_to_binary(List),
1058+
% If we get here, it's not a UTF-8 issue
1059+
{error, {?LOC(Line, Column), "unsafe atom does not exist: ", List}}
10441060
end;
1045-
unsafe_to_atom(List, _Line, _Column, #elixir_tokenizer{}) when is_list(List) ->
1046-
{ok, list_to_atom(List)}.
1061+
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{}) when is_list(List) ->
1062+
try
1063+
{ok, list_to_atom(List)}
1064+
catch
1065+
error:badarg ->
1066+
% Try to convert using elixir_utils to get proper UnicodeConversionError
1067+
elixir_utils:characters_to_binary(List),
1068+
% If we get here, it's not a UTF-8 issue, so it's some other badarg
1069+
{error, {?LOC(Line, Column), "invalid atom: ", List}}
1070+
end.
10471071

10481072
collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) ->
10491073
collect_modifiers(T, [H | Buffer]);

lib/elixir/test/elixir/code_test.exs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,27 @@ defmodule CodeTest do
514514
}
515515
end
516516

517+
test "string_to_quoted raises UnicodeConversionError for invalid UTF-8 in quoted atoms and function calls" do
518+
invalid_utf8_cases = [
519+
# Quoted atom
520+
~S{:"\xFF"},
521+
~S{:'\xFF'},
522+
# Quoted function call
523+
~S{foo."\xFF"()},
524+
~S{foo.'\xFF'()}
525+
]
526+
527+
for code <- invalid_utf8_cases do
528+
assert_raise UnicodeConversionError, fn ->
529+
Code.string_to_quoted!(code)
530+
end
531+
532+
assert_raise UnicodeConversionError, fn ->
533+
Code.string_to_quoted!(code, existing_atoms_only: true)
534+
end
535+
end
536+
end
537+
517538
@tag :requires_source
518539
test "compile source" do
519540
assert __MODULE__.__info__(:compile)[:source] == String.to_charlist(__ENV__.file)

0 commit comments

Comments
 (0)