aboutsummaryrefslogtreecommitdiff
path: root/wtf8.c
blob: 6f369a35ff0125cffba3866a3b2cd6c8a90a5adf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/**
 * Copyright (C) 2016--2019, 2021 Tom Ryder <tom@sanctum.geek.nz>
 *
 * This file is part of wtf8.
 *
 * wtf8 is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * wtf8 is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * wtf8.  If not, see <https://www.gnu.org/licenses/>.
 */

#include "wtf8.h"

/*
 * Report whether an octet is a UTF-8 continuation byte, i.e. whether its two
 * most significant bits are "10".  Returns 1 if so, 0 otherwise.
 */
int is_utf8_cont(unsigned char chr) {
	/*
	 * Shifting the octet right by six leaves only its top two bits; a
	 * continuation byte yields binary 10
	 */
	return (chr >> 6) == 0x2;
}

/*
 * Write every octet of a NUL-terminated string to the stream as two lowercase
 * hex digits.  Each octet is preceded by one separator character: BYTE_SEP
 * when the octet is a UTF-8 continuation byte, CHAR_SEP otherwise.  The line
 * is finished with a newline.
 */
void print_octets(FILE *stream, char *str) {
	char *cursor;

	/*
	 * Walk the string up to (not including) the terminating NUL
	 */
	for (cursor = str; *cursor; cursor++) {
		unsigned char octet;
		char sep;

		octet = *cursor;

		/*
		 * Choose the separator according to whether this octet continues
		 * the previous character or begins a new one
		 */
		if (is_utf8_cont(octet))
			sep = BYTE_SEP;
		else
			sep = CHAR_SEP;

		fprintf(stream, "%c%02x", sep, octet);
	}

	/*
	 * Terminate the line
	 */
	fputc('\n', stream);
}

/*
 * Print each UTF-8 character of a NUL-terminated string so that it lines up
 * with the output of print_octets(): every octet occupies three columns
 * there, so each continuation byte contributes three blanks here, and the
 * character itself is written after two more blanks, placing it under the
 * last octet that encodes it.  The line is finished with a newline.
 *
 * If a single character appears to have UCHAR_MAX or more octets, a message
 * is written to stderr and the program exits with EXIT_FAILURE.
 */
void print_characters(FILE *stream, char *str) {

	/*
	 * Each pass of this loop consumes one whole character from the string
	 */
	while (*str) {

		/*
		 * Number of octets in the current character; the first octet is
		 * always counted, so we start from 1
		 */
		unsigned char len;

		/*
		 * Count the continuation bytes that follow the first octet,
		 * printing a three-column blank for each.  The terminating NUL is
		 * not a continuation byte, so this never reads past the end of the
		 * string.
		 */
		for (len = 1; is_utf8_cont((unsigned char) str[len]); len++) {
			fprintf(stream, "   ");

			/*
			 * The counter would wrap on the next increment; a run
			 * this long is probably a perverse string of bytes for
			 * fuzzing or exploitation, so bail out.  Use
			 * EXIT_FAILURE for consistency with main().
			 */
			if (len == UCHAR_MAX) {
				fprintf(stderr, "Perverse byte count, bailing\n");
				exit(EXIT_FAILURE);
			}
		}

		/*
		 * Two blanks fill the columns of the final octet that are not
		 * taken by the character itself; then emit all of its octets
		 */
		fprintf(stream, "  ");
		while (len--) {
			fputc(*str++, stream);
		}
	}

	/*
	 * End with a newline
	 */
	fputc('\n', stream);
}

/*
 * Program entry point: expects exactly one command-line argument and prints
 * it to stdout twice, first as hex octets and then as the characters aligned
 * beneath them.  Exits with EXIT_SUCCESS after printing, or with EXIT_FAILURE
 * and a message on stderr if the argument count is wrong.
 */
int main(int argc, char **argv) {
	char *text;

	/*
	 * Happy path: exactly one argument after the program name
	 */
	if (argc == 2) {
		text = argv[1];

		/*
		 * Hex octets on the first line, matching characters on the second
		 */
		print_octets(stdout, text);
		print_characters(stdout, text);

		exit(EXIT_SUCCESS);
	}

	/*
	 * Any other argument count is a usage error
	 */
	fprintf(stderr, "%s: Need one argument\n", PROGRAM_NAME);
	exit(EXIT_FAILURE);
}