I had read before that dealing with UTF-8 was non-trivial in Ada, but I was never concerned, and honestly I only needed to write a puny rightward arrow… But pardon my French, holy ßðit !!!
So I only had to write three lines using Wide_Wide_String and Put from the related package. That’s all. The symbol was displayed, and only è wasn’t: Put_Line from Wide_Wide_Text_IO ALWAYS mangles that accented character. Text_IO doesn’t.
I tried -gnatW8, I tried pragma Wide_Character_Encoding(UTF8), nothing works.
See the code:
pragma Extensions_Allowed(All);
pragma Wide_Character_Encoding(UTF8);
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Wide_Wide_Text_IO;
with Ada.Characters.Conversions;
use Ada.Characters.Conversions;
-- genseq: reads records from standard input and writes formatted sequences
-- to standard output.
-- Input, as consumed by the body at the bottom:
--   * first line: a Boolean image, stored in WithNormal;
--   * one line skipped by Skip_Line;
--   * then, until end of file, pairs of lines: a pattern number (1 .. 5)
--     followed by a data line of 3 or 4 characters.
-- NOTE(review): TIO.Put writes through Ada.Wide_Wide_Text_IO, while every
-- unqualified Put / Put_Line below takes a String and is made visible by
-- "use Ada.Text_IO", so a single output line is produced by two different
-- I/O packages.
procedure genseq is
package TIO renames Ada.Wide_Wide_Text_IO;
subtype WWString is Wide_Wide_String;
subtype WWCharacter is Wide_Wide_Character;
-- V wraps its argument as "[" & S & "](V)"; R wraps it as "[" & S & "](R)".
function V(S: String) return String is ('[' & S & "](V)") with Inline;
function R(S: String) return String is ('[' & S & "](R)") with Inline;
-- Single-character overloads: build a one-element String and delegate.
function V(S: Character) return String is (V(String'(1=>S)));
function R(S: Character) return String is (R(String'(1=>S)));
-- Short names for the String/Character -> Wide_Wide conversions.
function TWWS (S:String) return Wide_Wide_String renames Ada.Characters.Conversions.To_Wide_Wide_String;
function TWWC (S:Character) return Wide_Wide_Character renames Ada.Characters.Conversions.To_Wide_Wide_Character;
-- Pattern 1. Header: A, V(B), R(C) joined by arrows, prefixed by "{{< ne >}}".
-- Then three repetitions of the AABC/ABBC/ABCC variants and a closing line.
procedure NVVV (S:String) is
-- A, B, C are the first three elements of S; the literal indices assume
-- S'First = 1 -- TODO confirm for every caller.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is (A&V(S)) with Inline;
function ABBC return String is (R(A)&B&V(B&C)) with Inline;
function ABCC return String is (V(A)&R(B)&C&V(C)) with Inline;
begin
-- Header goes through Wide_Wide_Text_IO (it contains the arrow literal).
TIO.Put("{{< ne >}}" & TWWS(String'(1=>A)) & "→" & TWWS(V(B)) & "→" & TWWS(R(C)));
for Ind in 1..3 loop
Put (" " & AABC);
Put (" " & ABBC);
Put (" " & ABCC);
end loop;
Put_Line (" " & V(S) & " " & V(S) & " " & V(A&B) & C & "{{</ ne >}}");
end NVVV;
-- Pattern 2. Same layout as NVVV with different variant expressions.
procedure NNVR(S:String) is
-- Literal indices assume S'First = 1 -- TODO confirm.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is (A&A&V(B)&R(C)) with Inline;
function ABBC return String is (A&B&V(B)&R(C)) with Inline;
function ABCC return String is (A&R(B)&C&R(C)) with Inline;
begin
TIO.Put("{{< ne >}}" & TWWS(String'(1 => A)) & "→" & TWWS(V(B)) & "→" & TWWS(R(C)));
for Ind in 1..3 loop
Put (" " & AABC);
Put (" " & ABBC);
Put (" " & ABCC);
end loop;
Put_Line (" " & A & V(B) & R(C) & " " & A & V(B) & R(C) & " " & A & V(B) & C & "{{</ ne >}}");
end NNVR;
-- Pattern 3. Header, three repetitions of the variants, three repetitions
-- of V(A) & R(B) & C, then the closing tag on its own Put_line.
procedure NVRN(S:String) is
-- Literal indices assume S'First = 1 -- TODO confirm.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is (A&V(A)&R(B)&C) with Inline;
function ABBC return String is (R(A)&B&R(B)&C) with Inline;
function ABCC return String is (A&R(B)&C&R(C)) with Inline;
begin
-- NOTE(review): unlike NVVV/NNVR/NRNV/Normal, this header has no
-- "{{< ne >}}" prefix although the closing tag is still written below.
TIO.put(TWWC(A) & "→" & TWWS(R(B)) & "→" & TWWC(C));
for Ind in 1..3 loop
Put (" " & AABC);
Put (" " & ABBC);
Put (" " & ABCC);
end loop;
for Ind in 1..3 loop
Put (" " & V(A) & R(B) & C);
end loop;
Put_line("{{</ ne >}}");
end NVRN;
-- Pattern 4. Same layout as NVVV with different variant expressions.
procedure NRNV(S:String) is
-- Literal indices assume S'First = 1 -- TODO confirm.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is (A&R(A)&B&V(C)) with Inline;
function ABBC return String is (R(A)&B&B&V(C)) with Inline;
function ABCC return String is (R(A)&B&C&V(C)) with Inline;
begin
TIO.put("{{< ne >}}" & TWWS(R(A)) & "→" & TWWC(B) & "→" & TWWS(R(C)));
for Ind in 1..3 loop
Put (" " & AABC);
Put (" " & ABBC);
Put (" " & ABCC);
end loop;
Put_line (" " & R(A) & B & V(C) & " " & R(A) & B & V(C) & " " & R(A) & B & C & "{{</ ne >}}");
end NRNV;
-- Pattern 5. Same layout as NVVV with different variant expressions.
procedure NNVV(S:String) is
-- Literal indices assume S'First = 1 -- TODO confirm.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is (A&A&V(B&C)) with Inline;
function ABBC return String is (A&R(B)&B&V(C)) with Inline;
function ABCC return String is (V(A&B)&R(C)&C) with Inline;
begin
-- NOTE(review): no "{{< ne >}}" prefix here either, while the closing tag
-- is written by the Put_line below.
TIO.put(TWWC(A) & "→" & TWWC(B) & "→" & TWWC(C));
for Ind in 1..3 loop
Put (" " & AABC);
Put (" " & ABBC);
Put (" " & ABCC);
end loop;
-- NOTE(review): the last group ends with A where the other procedures end
-- with C -- confirm this is intended.
Put_line (" " & V(S) & " " & V(S) & " " & V(A&B) & A & "{{</ ne >}}");
end NNVV;
-- Unmarked variant, written before the selected pattern when WithNormal is
-- True: every group starts with '*' & A & '*'.
procedure Normal (S:String) is
-- Literal indices assume S'First = 1 -- TODO confirm.
A renames S(1);
B renames S(2);
C renames S(3);
function AABC return String is ('*'&A&'*'&A&B&C) with Inline;
function ABBC return String is ('*'&A&'*'&B&B&C) with Inline;
function ABCC return String is ('*'&A&'*'&B&C&C) with Inline;
begin
TIO.Put("{{< ne >}}" & TWWC(A) & "→" & TWWC(B) & "→" & TWWC(C));
for Ind in 1..3 loop
Put(" " & AABC & " " & ABBC & " " & ABCC);
end loop;
for Ind in 1..3 loop
Put(" " & '*' & A & '*' & B & C);
end loop;
Put_line("{{</ ne >}}");
end Normal;
-- First input line, parsed with Boolean'Value. This runs during elaboration
-- of the declarative part, so an exception raised here is not covered by
-- the handler at the end of genseq.
WithNormal: Boolean := Boolean'Value(Get_Line);
begin
-- NOTE(review): the Get_Line above has already consumed the first line, so
-- this presumably discards the whole second input line -- confirm the
-- expected input layout.
Skip_Line;
while not End_Of_File loop
declare
IncorrectDataLength: exception;
-- One record: the pattern number line, then the data line.
Choice: String := Get_Line;
Data: String := Get_Line;
begin
-- Only the first three characters of Data are used by the procedures.
raise IncorrectDataLength when Data'Length not in 3..4;
if WithNormal then Normal(Data); end if;
-- Positive'Value raises Constraint_Error on a non-numeric Choice; values
-- outside 1 .. 5 raise it explicitly, so both reach the same handler.
case Positive'Value(Choice) is
when 1 => NVVV(Data);
when 2 => NNVR(Data);
when 3 => NVRN(Data);
when 4 => NRNV(Data);
when 5 => NNVV(Data);
when others => raise Constraint_Error;
end case;
exception
-- Every handler reports on Current_Error and leaves the read loop.
when Constraint_Error => Put_line(Current_Error, Choice & "line " & Line'Image & ": Invaid pattern"); exit;
when IncorrectDataLength => Put_line(Current_Error, Data & "line " & Line'Image & ": Invalid Data length (" & Data'Length'Image & ')'); exit;
when others => Put_line(Current_Error, "Lacks a pattern or data"); exit;
end;
end loop;
exception
-- Reached when a Get_Line in the loop's declare part hits end of file.
when End_Error => Put_line(Current_Error, "A line is missing");
end genseq;
try with this (standard) input file:
False
1
mèb
my terminal is top-notch and it’s è we’re talking about, not some obscure complex unicode glyph.
I noticed something similar for something I was working on. Out of curiosity, try piping the output into cat and see if it still displays wrong? I found that my terminal couldn’t handle it natively, but some programs could display it properly. Make sure that you still have -gnatW8 enabled for this test.
UTF-8 with GNAT is very simple if you understand how it works. In essence the -gnatW8 switch corrupts all I/O subsystem and string stream attributes. In effect UTF-8 would be decoded into UCS-2/4 and output would be encoded into UTF-8. Note that corruption is done on the run-time level. It means that the -gnatW8 takes precedence over the code compiled before without it! This means that a tested library can suddenly behave in a completely different way!
Consider this sample program to understand how -gnatW8 works:
with Ada.Text_IO;
with Ada.Wide_Wide_Text_IO;
--  Writes the character at code point 16#E8# once through Ada.Text_IO and
--  once through Ada.Wide_Wide_Text_IO, each after a label naming the
--  package, so the two outputs can be compared.
procedure Main is
   package Narrow_IO renames Ada.Text_IO;
   package Wide_IO renames Ada.Wide_Wide_Text_IO;

   --  Code point shared by both outputs.
   E_With_Grave : constant := 16#E8#;

   Narrow_Grave : constant Character := Character'Val (E_With_Grave);
   Wide_Grave   : constant Wide_Wide_Character :=
     Wide_Wide_Character'Val (E_With_Grave);
begin
   Narrow_IO.Put_Line ("Text_IO " & Narrow_Grave);
   Wide_IO.Put_Line ("Wide_Wide_Text_IO " & Wide_Grave);
end Main;
If you build it without -gnatW8, it will output Latin-1 as specified and you will see this
Text_IO ▒
Wide_Wide_Text_IO ▒
on a UTF-8 console. I.e. it will not touch anything. If you build with -gnatW8, both will be encoded to UTF-8:
Text_IO è
Wide_Wide_Text_IO è
My recommendation is even simpler:
Never use any Wide strings and I/O;
Never use -gnatW8. Make sure that your compiler does not have it as the default;
Assume String UTF-8 encoded;
As a consequence you will lose Unicode literals and identifiers, which is no loss at all IMO.
… I was wrong again.
There is this variable S, in the code above.
Outputting it as a whole is fine: Put(S). Put(S(1..3)) is fine too.
But with Put(S(1)) or Put(S(2)), it’s mangled.
What kind of bug is that ?!
And worse every time I change the code, I can’t single it out.
I’ve had enough of this… I hope someone compiles the example and figures it out.
It is a bug by design. When reading Unicode into a String you have a choice between bad and worse. If you attempt to decode it, then all input beyond Latin-1 suddenly becomes illegal and you have to raise Data_Error, which nobody will ever accept. So -gnatW8 encodes string output but does not decode input back.
No, -gnatW8 consistently corrupts both singular characters and strings. At least GNAT 13.3.0 does so.
Ok, is there a good Ada solution that doesn’t involve breaking the knee and using outside libraries ? Sorry to say this, but what have the developers/designers/I-don’t-give-a-damn been doing ? I can read and output most of Unicode in both my browsers, terminal, text editor, etc, all written either in C, C++, Rust or Go. But Ada (or its environment, not my problem) does that ?
Those are rhetorical questions, Do not feel obliged to respond.
You should stay away from using anything but Latin_1 in Ada.
Time is wasted on dealing with the mess of standard library string types.
Don’t use anything with Wide_* in name, don’t even use (Un-)bounded_Strings in Ada.
If you feel the need to, either think harder and solve with String and Latin_1 or change the language until Ada manages to tackle this.
For real ? I drop Ada for months and the moment I find pleasure to use it practically on things I can handle, I find the one sore point ugly to warrant this advice ?
For a damn è ?!??
This is nuts.
Isn’t there a way to read è characters correctly and consistently ? I can avoid crazy emojis, but I’m French, putain de merde. I can’t output my own language now !
I also had a lot of problems with Ada’s String handling and it took me some time to get used to it. I’m still not a huge fan but I think that’s just because of the way strong typing is. It feels kinda wrong having to cast from Unbounded to Bounded and back and more.
You would and you should use a library that handles these strings for you. I haven’t used them but this resource should guide you more than I can do.
You’re being a bit dramatic. This is an old language.
I don’t know why some Latin_1 are not printable (they should).
In any case, Ada lacks a proper abstraction for the “string”.
Use ‘e’ until situation improves.
It’s because you shouldn’t do it.
It’s perfectly fine for an old language to have sections in the standard library that you should not touch.
And you shouldn’t touch Wide_* and (Un)bounded_Strings because they are an unergonomic mess.
It is very likely not Ada the language, but rather (a) GNAT mangling things, or (b) your terminal.
Assuming you’re using Windows, for the terminal:
Ensure you are using the correct codepage: chcp 65001.
Ensure the terminal is using a Unicode font: right-click the title-bar or system-menu, click Properties, and select “Lucida Console” (I think “Consolas” works, too, but I might be misremembering).
You can verify correct transmission via GPS’s “Run” tab; this compensates for GNAT’s… odd runtime.
As for GNAT, I never use -gnatW8, at least not directly. (Using GPS.) Instead:
Right-click your source, select Properties, set Character Set to “Unicode UTF-8”.
Put Pragma Wide_Character_Encoding( UTF8 ); at the top of the source.
I disagree, the standard has since Ada 2012 defined Identifiers in terms of Unicode.
This is a GNAT problem, not an Ada problem.
If you find the situation intolerable, consider contributing to another Ada implementation than GNAT. There’s my Byron & BAATS compilers, the HAC compiler, and a few new ones. — The presence of a viable non-GNAT open source Ada will change how willing people are to jump in on GNAT… and it will help keep stupid stuff out of the language, like AdaCore’s disgusting proposal for a Class construct/syntax-sugar for tagged types.
I use Void Linux, the Kitty 0.45 terminal, and fr_FR.UTF-8 as the locale. I also tested with LibreOffice. The terminal’s innocent. I’ll participate in your project to the best of my capacity.
I would disagree, but then again, I use Wide_Wide_Character exclusively in Byron precisely so that I don’t have to screw about with the various encodings (UTF8, UTF16BE, UTF16LE, etc), all of the forms are handled in the reader, converted to Wide_Wide_Character, and used as a singular/standardized input.
Again, in processing, using Wide_Wide_Character allows me to (a) enforce correctness, by directly defining to the standard using Unicode/Wide_Wide_Character, and (b) allows for safety against malicious source-code attacks, as shown here where the subtype defines the Emittable subset of tokens, thereby excluding all nonprocessed (ie plain-text) tokens, which forces all “texty” nodes (Identifiers, Comments, & Literals) to pass their construction/validation tests.
There’s a lot of criticism about Ada’s various String-types, and it could/should be handled better (IMO, via generic-instantiation, thus collapsing the codebase into mere instances). BUT, there is a place for the various types:
Fixed: A simple, fixed length.
Bounded: Maps exactly onto DB Strings.
Unbounded: A general, varying-length, particularly for dynamic+in-place [textual] processing.
Were the ARG to consolidate the various children of Ada.Strings via generic as previously suggested, there would be an added benefit: [near] direct usage of CS linguistics papers, and thereby ease implementation difficulties. (ie consider the ability to be able to search-and-replace on a vector of tokens.)
There are clearly uses for all the different String and Character versions.
An experienced Ada programmer will not have much trouble dealing with them.
I too use those things in some code I’ve written, mostly for convenience when more robust solution is not worth the time.
Judging the situation from the outside is another story.
These types are objectively bad abstractions for the most generic idea of a “string”.
Their design is antiquated, they are confusing to beginners, they don’t work together, and, worst of all, these standard library types hurt the readability.
Actually, they don’t even work as is more evident each time someone asks about it.
These packages really should not exist in a high-level programming language and I cannot in good conscience tell somebody that this is a manageable situation.
To me it’s more pragmatic to opt out, so I stick to Strings, that is arrays of bytes, because at least they are simple, efficient and readable, and actually really effective thanks to compiler magics.
All of this, is of course nobody’s fault, Ada is just an old language.
And no, it didn’t last. Same setting, doesn’t work twice. No wonder a good many programmers think Ada is a laughing matter, if not an already extinct language. Even if the responsibility falls onto the implementers.
I could be completely off-base here, and it wouldn’t surprise me if the regular Ada programmers correct or even reject some of the advice I give below, but I’ve gotten UTF-8 and UTF-16 to work more than once in Ada; I think I’ve gotten your code to work (see below); and I’d really like you to take the following suggestions to heart.
Try to provide a minimal working example. (Perhaps minimal “non-working” example is more appropriate in this case, but you get the idea.)
Make your code readable. Consider consulting the Ada Style Guide (elsewhere on this site) on naming. In particular:
You’ve used an awful lot of acronyms and shortcuts that make it hard to determine what you’re trying to do – and I’d bet that it makes it hard not only for us, but also for you – even if you don’t realize it. It’s not reasonable to ask others for help and then provide them with a large-ish program filled with cryptic acronyms.
For example, what is NNVR supposed to mean? I have no idea from its name what that subprogram is supposed to do. I probably could glean an idea, were I to dig into the code, but your readers have only so much time, and if you want to encourage them to help you, you need to minimize the energy they’ll spend trying to understand what you’re doing.
It’s okay if NNVR expands to some French terms; some of us read French and we all have access to Yandex Translate (or Google Translate or whatever).
GNAT emitted a lot of warnings on your code. Nearly all were due to using S (1) instead of S (S'First), S (2) instead of S (S'First + 1), etc. You should heed those warnings rather than silence or ignore them. (Trust me: they’ve come around and bitten me in the tushie.)
Perhaps the one of most interest to you: If you want to input/output UTF-8, and you’re starting with fresh code, don’t use Ada.Text_IO, String, and Character. Use Ada.Wide_Wide_Text_IO, Wide_Wide_String, and Wide_Wide_Character, and work with those as much as possible.
That last bullet may irritate a lot of readers, but I applied it to your code and… it just worked. I’ll paste the result below. It works consistently for me, while the code you pasted fails consistently, so it should work consistently for you.
But I’m not that much of an expert on this, so if it doesn’t work, then I apologize.
(I also apologize for the changes to the formatting; the ada-lang plugin in VSCode does that for me, and I’m not turning that off, even though I don’t always agree with its decisions.)
pragma Ada_2022;
pragma Wide_Character_Encoding (UTF8);
with Ada.Wide_Wide_Text_IO; use Ada.Wide_Wide_Text_IO;
with Ada.Characters.Conversions; use Ada.Characters.Conversions;
--  genseq: reads records from standard input and writes formatted
--  sequences to standard output.  All text I/O goes through
--  Ada.Wide_Wide_Text_IO (renamed TIO).
--
--  Input, as consumed by the body at the bottom:
--    * first line : a Boolean image, stored in WithNormal;
--    * one line skipped by Skip_Line;
--    * then, until end of file, pairs of lines: a pattern number
--      (1 .. 5) followed by a data line of 3 or 4 characters.
--
--  Errors are reported on Current_Error and stop the read loop.
procedure genseq is

   package TIO renames Ada.Wide_Wide_Text_IO;

   subtype WWString is Wide_Wide_String;
   subtype WWCharacter is Wide_Wide_Character;

   --  How many times each group of variants is written on a line.
   Repetitions : constant := 3;

   --  V wraps its argument as "[" & S & "](V)".
   function V (S : WWString) return WWString
   is ('[' & S & "](V)")
   with Inline;

   --  R wraps its argument as "[" & S & "](R)".
   function R (S : WWString) return WWString
   is ('[' & S & "](R)")
   with Inline;

   --  Single-character overloads: build a one-element string and delegate.
   function V (S : WWCharacter) return WWString
   is (V (WWString'(1 => S)));

   function R (S : WWCharacter) return WWString
   is (R (WWString'(1 => S)));

   --  Writes " " & Head, " " & Middle, " " & Tail, Repetitions times over.
   --  Replaces the loop that was copy-pasted into every pattern procedure;
   --  the characters written are the same.
   procedure Put_Variants (Head, Middle, Tail : WWString) is
   begin
      for Pass in 1 .. Repetitions loop
         TIO.Put (" " & Head);
         TIO.Put (" " & Middle);
         TIO.Put (" " & Tail);
      end loop;
   end Put_Variants;

   --  Writes " " & Item, Repetitions times over.
   procedure Put_Repeated (Item : WWString) is
   begin
      for Pass in 1 .. Repetitions loop
         TIO.Put (" " & Item);
      end loop;
   end Put_Repeated;

   --  Pattern 1.  Header "{{< ne >}}" A → V(B) → R(C), the repeated
   --  variants, then a closing line ending with "{{</ ne >}}".
   procedure NVVV (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is (A & V (S))
      with Inline;

      function ABBC return WWString
      is (R (A) & B & V ([1 => B, 2 => C]))
      with Inline;

      function ABCC return WWString
      is (V (A) & R (B) & C & V (C))
      with Inline;
   begin
      TIO.Put
        ("{{< ne >}}" & WWString'(1 => A) & "→" & V (B) & "→" & R (C));
      Put_Variants (AABC, ABBC, ABCC);
      TIO.Put_Line
        (" " & V (S) & " " & V (S) & " " & V (A & B) & C & "{{</ ne >}}");
   end NVVV;

   --  Pattern 2.  Same layout as NVVV with different variant expressions.
   procedure NNVR (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is (A & A & V (B) & R (C))
      with Inline;

      function ABBC return WWString
      is (A & B & V (B) & R (C))
      with Inline;

      function ABCC return WWString
      is (A & R (B) & C & R (C))
      with Inline;
   begin
      TIO.Put
        ("{{< ne >}}" & WWString'(1 => A) & "→" & V (B) & "→" & R (C));
      Put_Variants (AABC, ABBC, ABCC);
      TIO.Put_Line
        (" " & A & V (B) & R (C)
         & " " & A & V (B) & R (C)
         & " " & A & V (B) & C
         & "{{</ ne >}}");
   end NNVR;

   --  Pattern 3.  Header, the repeated variants, then V(A) & R(B) & C
   --  repeated, then the closing tag.
   --  NOTE(review): the header carries no "{{< ne >}}" prefix although the
   --  closing tag is written -- kept as in the original, confirm intent.
   procedure NVRN (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is (A & V (A) & R (B) & C)
      with Inline;

      function ABBC return WWString
      is (R (A) & B & R (B) & C)
      with Inline;

      function ABCC return WWString
      is (A & R (B) & C & R (C))
      with Inline;
   begin
      TIO.Put (A & "→" & R (B) & "→" & C);
      Put_Variants (AABC, ABBC, ABCC);
      Put_Repeated (V (A) & R (B) & C);
      TIO.Put_Line ("{{</ ne >}}");
   end NVRN;

   --  Pattern 4.  Same layout as NVVV with different variant expressions.
   procedure NRNV (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is (A & R (A) & B & V (C))
      with Inline;

      function ABBC return WWString
      is (R (A) & B & B & V (C))
      with Inline;

      function ABCC return WWString
      is (R (A) & B & C & V (C))
      with Inline;
   begin
      TIO.Put ("{{< ne >}}" & R (A) & "→" & B & "→" & R (C));
      Put_Variants (AABC, ABBC, ABCC);
      TIO.Put_Line
        (" " & R (A) & B & V (C)
         & " " & R (A) & B & V (C)
         & " " & R (A) & B & C
         & "{{</ ne >}}");
   end NRNV;

   --  Pattern 5.  Same layout as NVVV with different variant expressions.
   --  NOTE(review): no "{{< ne >}}" prefix in the header, and the closing
   --  line ends with A where the other patterns end with C -- both kept as
   --  in the original, confirm intent.
   procedure NNVV (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is (A & A & V (B & C))
      with Inline;

      function ABBC return WWString
      is (A & R (B) & B & V (C))
      with Inline;

      function ABCC return WWString
      is (V (A & B) & R (C) & C)
      with Inline;
   begin
      TIO.Put (A & "→" & B & "→" & C);
      Put_Variants (AABC, ABBC, ABCC);
      TIO.Put_Line
        (" " & V (S) & " " & V (S) & " " & V (A & B) & A & "{{</ ne >}}");
   end NNVV;

   --  Unmarked variant, written before the selected pattern when
   --  WithNormal is True: every group starts with '*' & A & '*'.
   procedure Normal (S : WWString) is
      A renames S (S'First);
      B renames S (S'First + 1);
      C renames S (S'First + 2);

      function AABC return WWString
      is ('*' & A & '*' & A & B & C)
      with Inline;

      function ABBC return WWString
      is ('*' & A & '*' & B & B & C)
      with Inline;

      function ABCC return WWString
      is ('*' & A & '*' & B & C & C)
      with Inline;
   begin
      TIO.Put ("{{< ne >}}" & A & "→" & B & "→" & C);
      Put_Variants (AABC, ABBC, ABCC);
      Put_Repeated ('*' & A & '*' & B & C);
      TIO.Put_Line ("{{</ ne >}}");
   end Normal;

   --  First input line, parsed with Boolean'Value.  This runs while the
   --  declarative part is elaborated, so an exception raised here is not
   --  covered by the handler at the end of genseq.
   WithNormal : constant Boolean :=
     Boolean'Value (Ada.Characters.Conversions.To_String (TIO.Get_Line));

begin
   --  NOTE(review): Get_Line above has already consumed the first line, so
   --  this presumably discards the whole second input line -- confirm the
   --  expected input layout.
   TIO.Skip_Line;

   while not TIO.End_Of_File loop
      declare
         IncorrectDataLength : exception;

         --  One record: the pattern number line, then the data line.  An
         --  End_Error raised by either read propagates past this block's
         --  handlers to the handler at the end of genseq.
         Choice : constant WWString := TIO.Get_Line;
         Data   : constant WWString := TIO.Get_Line;
      begin
         --  Only the first three characters of Data are used.
         if Data'Length not in 3 .. 4 then
            raise IncorrectDataLength;
         end if;

         if WithNormal then
            Normal (Data);
         end if;

         --  Positive'Value raises Constraint_Error on a non-numeric Choice;
         --  values outside 1 .. 5 raise it explicitly, so both cases reach
         --  the same handler.
         case Positive'Value (Ada.Characters.Conversions.To_String (Choice)) is
            when 1 =>
               NVVV (Data);
            when 2 =>
               NNVR (Data);
            when 3 =>
               NVRN (Data);
            when 4 =>
               NRNV (Data);
            when 5 =>
               NNVV (Data);
            when others =>
               raise Constraint_Error;
         end case;
      exception
         when Constraint_Error =>
            TIO.Put_Line
              (TIO.Current_Error,
               Choice
               & "line "
               & Ada.Characters.Conversions.To_Wide_Wide_String (Line'Image)
               & ": Invalid pattern");
            exit;
         when IncorrectDataLength =>
            TIO.Put_Line
              (TIO.Current_Error,
               Data
               & "line "
               & Ada.Characters.Conversions.To_Wide_Wide_String (Line'Image)
               & ": Invalid Data length ("
               & Ada.Characters.Conversions.To_Wide_Wide_String
                   (Data'Length'Image)
               & ')');
            exit;
         when others =>
            TIO.Put_Line (TIO.Current_Error, "Lacks a pattern or data");
            exit;
      end;
   end loop;
exception
   when TIO.End_Error =>
      TIO.Put_Line (TIO.Current_Error, "A line is missing");
end genseq;