/**
* Reads an HTML file, identifies blocks of C code contained within
* `<pre><code>` tags, and applies syntax highlighting in the form of `<span>`
* tags to those blocks of C code.
*
* Any pre-existing syntax highlighting is stripped from the C code before
* applying new syntax highlighting so that the process is idempotent. This is
* done for a few reasons:
*
* 1) Only one HTML file is required, rather than needing both an unprocessed
* HTML and a post-processed HTML file.
*
* 2) That single HTML file can be served in the future as-is, even if this tool
* were to disappear or break.
*
* 3) If the tool is updated in the future to apply different syntax
* highlighting, that single HTML file can simply be run through the tool once
* more in order to update it.
*/
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
// Read-ahead state shared by peek() and drop(): a stdio stream plus a
// caller-provided buffer holding bytes that were read but not yet consumed.
struct buffered_stream {
// Stream the buffer is refilled from (via fread).
FILE *stream;
// Storage for buffered bytes; supplied by whoever creates the struct.
char *buffer;
// Total number of bytes that fit in buffer.
size_t capacity;
// Number of buffered bytes not yet dropped, starting at buffer[index].
size_t available;
// Offset into buffer of the first byte not yet dropped.
size_t index;
};
// Returns the smaller of the two sizes.
size_t min(size_t x, size_t y) {
    if (y < x) {
        return y;
    }
    return x;
}
// Exposes up to len upcoming bytes of the stream without consuming them.
//
// The request is capped at the buffer capacity. When fewer bytes than
// requested are buffered, the unread bytes are moved to the front of the
// buffer and the remaining space is refilled from the stream.
//
// *result is set to the first unread byte. Returns the number of bytes that
// may be read through *result, which can be less than len (0 once the stream
// is exhausted).
size_t peek(struct buffered_stream *bs, size_t len, char **result) {
    const size_t wanted = min(len, bs->capacity);

    if (wanted > bs->available) {
        // Compact: slide the unread tail down to offset 0 ...
        memmove(bs->buffer, bs->buffer + bs->index, bs->available);
        bs->index = 0;

        // ... then top the buffer up with whatever the stream can provide.
        char *free_space = bs->buffer + bs->available;
        size_t free_len = bs->capacity - bs->available;
        size_t filled =
            fread(free_space, sizeof(*bs->buffer), free_len, bs->stream);
        bs->available += filled;
    }

    *result = bs->buffer + bs->index;
    return min(wanted, bs->available);
}
// Consumes up to len bytes from the front of the stream.
//
// Buffering is delegated to peek(), which caps the request at the buffer
// capacity and refills the buffer when needed. Returns the number of bytes
// actually consumed, which may be less than len.
size_t drop(struct buffered_stream *bs, size_t len) {
    char *unused;
    const size_t dropped = peek(bs, len, &unused);

    // Advance past the consumed bytes.
    bs->available -= dropped;
    bs->index += dropped;
    return dropped;
}
// TODO: Stripping of pre-existing syntax highlighting may be better done as a
// layer in the peek/drop functions themselves, in order to simplify multiple
// other functions which need to consider pre-existing syntax highlighting.
//
// Discards input up to and including the next '>' (or up to end of input if
// there is none). Nothing is written; out is accepted only so the signature
// matches the other handlers.
void handle_pre_existing_syntax(struct buffered_stream *bs, FILE *out) {
    char *cursor;
    while (peek(bs, 1, &cursor)) {
        // Read the byte before dropping it; the drop invalidates the view.
        const bool closes_tag = (*cursor == '>');
        drop(bs, 1);
        if (closes_tag) {
            return;
        }
    }
}
// TODO: For multi-line comments made up of multiple single-line comments, use a
// single span block rather than multiple
//
// Copies a "//" comment to out. Callers consume the leading "//" before
// calling; this function re-emits it, then copies bytes up to and including
// the terminating newline (or up to end of input).
//
// NOTE(review): the literals written around the comment text are empty (or a
// bare newline) in this copy of the file; presumably they once carried the
// highlighting tags. Confirm against the upstream source.
void handle_single_line_comment(struct buffered_stream *bs, FILE *out) {
    // Closing tag left behind by an earlier run of this tool. The comparison
    // length is derived from the literal so the two cannot drift apart and
    // memcmp never reads past the end of the literal.
    static const char closing_tag[] = "</span>";
    const size_t closing_len = sizeof(closing_tag) - 1;

    fputs("//", out);
    while (true) {
        char *buffer;
        size_t available = peek(bs, closing_len, &buffer);
        if (!available) {
            // End of input inside the comment.
            fputs("", out);
            break;
        } else if (available >= closing_len &&
                   !memcmp(buffer, closing_tag, closing_len)) {
            // Normally stripping of any previously existing syntax would occur
            // in the parent function, but since single-line comments swallow
            // until newline, handling needs to be duplicated here
            drop(bs, closing_len);
        } else if (*buffer == '\n') {
            fputs("\n", out);
            drop(bs, 1);
            break;
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}
// Copies a block comment to out. Callers consume the opening "/*" before
// calling; this function re-emits it, then copies bytes through the closing
// "*/" (or up to end of input if the comment is unterminated).
//
// NOTE(review): the empty literal written at end of input looks like a
// placeholder for highlighting markup; confirm against the upstream source.
void handle_multi_line_comment(struct buffered_stream *bs, FILE *out) {
    fputs("/*", out);
    for (;;) {
        char *cursor;
        const size_t have = peek(bs, 2, &cursor);

        if (have == 0) {
            fputs("", out);
            return;
        }
        if (have >= 2 && memcmp(cursor, "*/", 2) == 0) {
            fputs("*/", out);
            drop(bs, 2);
            return;
        }

        putc(*cursor, out);
        drop(bs, 1);
    }
}
// Copies a string literal to out. Callers consume the opening '"' before
// calling; this function re-emits it, then copies bytes through the closing
// '"' (or up to end of input if the literal is unterminated).
//
// NOTE(review): the empty literal written at end of input looks like a
// placeholder for highlighting markup; confirm against the upstream source.
void handle_string_literal(struct buffered_stream *bs, FILE *out) {
    fputs("\"", out);
    while (true) {
        char *buffer;
        size_t available = peek(bs, 2, &buffer);
        if (!available) {
            fputs("", out);
            break;
        } else if (*buffer == '"') {
            fputs("\"", out);
            drop(bs, 1);
            break;
        } else if (*buffer == '\\' && available >= 2) {
            // Copy a backslash together with the byte it escapes. Treating
            // only \" specially would misread the second half of an escaped
            // backslash: in "\\" the sequence \" is not an escaped quote.
            fwrite(buffer, sizeof(*buffer), 2, out);
            drop(bs, 2);
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}
// Copies a character literal to out. Callers consume the opening '\'' before
// calling; this function re-emits it, then copies bytes through the closing
// '\'' (or up to end of input if the literal is unterminated).
//
// NOTE(review): the empty literal written at end of input looks like a
// placeholder for highlighting markup; confirm against the upstream source.
void handle_character_literal(struct buffered_stream *bs, FILE *out) {
    fputs("'", out);
    while (true) {
        char *buffer;
        size_t available = peek(bs, 2, &buffer);
        if (!available) {
            fputs("", out);
            break;
        } else if (*buffer == '\'') {
            fputs("'", out);
            drop(bs, 1);
            break;
        } else if (*buffer == '\\' && available >= 2) {
            // Copy a backslash together with the byte it escapes. Treating
            // only \' specially would misread '\\': after the first backslash
            // is copied alone, the remaining \' looks like an escaped quote
            // and the literal never terminates.
            fwrite(buffer, sizeof(*buffer), 2, out);
            drop(bs, 2);
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}
// Reports whether the len bytes at word spell exactly one of the type names
// or qualifiers that receive type highlighting. The length must match too, so
// that identifiers which merely start with a keyword ("internal", "charset")
// are not treated as types.
static bool is_type_keyword(const char *word, size_t len) {
    static const char *const keywords[] = {
        "void",     "const",    "static",   "signed",    "unsigned",
        "bool",     "char",     "short",    "int",       "long",
        "float",    "double",   "uint8_t",  "uint16_t",  "uint32_t",
        "uint64_t", "uint128_t", "int8_t",  "int16_t",   "int32_t",
        "int64_t",  "int128_t", "size_t",   "FILE",      "va_list",
    };
    for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
        if (strlen(keywords[i]) == len && !memcmp(word, keywords[i], len)) {
            return true;
        }
    }
    return false;
}

// Copies one identifier to out and consumes it from the stream. The stream
// must be positioned on the identifier's first character, which is not
// consumed by the caller.
//
// The identifier is classified as a function call when it is directly
// followed by '(' (optionally preceded by a closing tag left by an earlier
// run), otherwise as a type keyword when it matches the keyword table, and
// otherwise it is copied verbatim.
//
// NOTE(review): the literals written around function names and type keywords
// are empty in this copy of the file; presumably they once carried the
// highlighting tags. Confirm against the upstream source.
void handle_identifier(struct buffered_stream *bs, FILE *out) {
    // A previously highlighted function name is followed by the closing tag
    // and only then by the parenthesis.
    static const char call_suffix[] = "</span>(";
    const size_t call_suffix_len = sizeof(call_suffix) - 1;

    bool is_function_call = false;
    char *buffer;
    size_t len = 1;
    while (true) {
        // Look far enough ahead to see the identifier so far plus the longest
        // marker that can follow it.
        size_t available = peek(bs, len + call_suffix_len, &buffer);
        if (available < len + 1) {
            break;
        }

        // <ctype.h> functions require a value representable as unsigned char.
        unsigned char next = (unsigned char)buffer[len];
        if (isalnum(next) || next == '_') {
            len += 1;
            continue;
        }

        if (next == '(') {
            is_function_call = true;
        } else if ((available - len) >= call_suffix_len &&
                   !memcmp(buffer + len, call_suffix, call_suffix_len)) {
            // Normally stripping of any previously existing syntax would occur
            // in the parent function, but since function calls don't include
            // the following parentheses, handling needs to be duplicated here
            is_function_call = true;
        }
        break;
    }

    if (is_function_call) {
        fputs("", out);
        fwrite(buffer, sizeof(*buffer), len, out);
        fputs("", out);
    } else if (is_type_keyword(buffer, len)) {
        fputs("", out);
        fwrite(buffer, sizeof(*buffer), len, out);
        fputs("", out);
    } else {
        fwrite(buffer, sizeof(*buffer), len, out);
    }
    drop(bs, len);
}
// Copies a numeric literal to out and consumes it from the stream: every
// leading byte that is alphanumeric or '.' is copied, which also covers
// prefixes and suffixes such as 0x1F or 1.5f. Stops at the first other byte
// or at end of input.
//
// NOTE(review): the literals written before and after the number are empty in
// this copy of the file; presumably they once carried the highlighting tags.
// Confirm against the upstream source.
void handle_number_literal(struct buffered_stream *bs, FILE *out) {
    fputs("", out);

    char *buffer;
    // The cast matters: passing a negative char (any non-ASCII byte where
    // char is signed) to isalnum is undefined behaviour.
    while (peek(bs, 1, &buffer) &&
           (isalnum((unsigned char)*buffer) || *buffer == '.')) {
        putc(*buffer, out);
        drop(bs, 1);
    }

    fputs("", out);
}
// Highlights C syntax contained within HTML <pre><code> blocks, until the
// corresponding ending </code></pre> tags are found. Ignores existing syntax
// highlighting so that this operation is idempotent.
//
// Processes one token per call. Returns true when something was consumed and
// the caller should call again; returns false at end of input or when the
// closing tags come next (they are left in the stream for the caller).
//
// NOTE(review): the tag and entity literals below were reconstructed from the
// byte counts they are compared with (13, 5, 7, 4 and 5); confirm against the
// upstream source.
bool highlight_syntax_c(struct buffered_stream *bs, FILE *out) {
    char *buffer;
    size_t available = peek(bs, 13, &buffer);
    if (!available) {
        return false;
    } else if (available >= 13 && !memcmp(buffer, "</code></pre>", 13)) {
        return false;
    } else if (available >= 5 && !memcmp(buffer, "<span", 5)) {
        // Opening tag from an earlier run: discard it through its '>'.
        handle_pre_existing_syntax(bs, out);
    } else if (available >= 7 && !memcmp(buffer, "</span>", 7)) {
        // Closing tag from an earlier run.
        drop(bs, 7);
    } else if (available >= 2 && !memcmp(buffer, "//", 2)) {
        drop(bs, 2);
        handle_single_line_comment(bs, out);
    } else if (available >= 2 && !memcmp(buffer, "/*", 2)) {
        drop(bs, 2);
        handle_multi_line_comment(bs, out);
    } else if (*buffer == '"') {
        drop(bs, 1);
        handle_string_literal(bs, out);
    } else if (*buffer == '\'') {
        drop(bs, 1);
        handle_character_literal(bs, out);
    } else if (isalpha((unsigned char)*buffer) || *buffer == '_') {
        // <ctype.h> functions require a value representable as unsigned char,
        // hence the casts here and below.
        handle_identifier(bs, out);
    } else if (isdigit((unsigned char)*buffer)) {
        handle_number_literal(bs, out);
    } else if (available >= 4 && !memcmp(buffer, "&lt;", 4)) {
        // Only '<' and '&' are escaped, as described here:
        // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/pre#escaping_ambiguous_characters
        // Already-escaped input is passed through unchanged ...
        fputs("&lt;", out);
        drop(bs, 4);
    } else if (available >= 5 && !memcmp(buffer, "&amp;", 5)) {
        fputs("&amp;", out);
        drop(bs, 5);
    } else if (*buffer == '<') {
        // ... and raw characters are escaped.
        fputs("&lt;", out);
        drop(bs, 1);
    } else if (*buffer == '&') {
        fputs("&amp;", out);
        drop(bs, 1);
    } else {
        putc(*buffer, out);
        drop(bs, 1);
    }
    return true;
}
// Parses the HTML document to find <pre><code> tags, and then highlights C
// syntax contained within those tags. Nothing in the rest of the HTML document
// should be modified.
//
// Processes one step per call: either a whole code block (the opening tags
// plus everything highlight_syntax_c consumes) or a single byte copied
// through unchanged. Returns false at end of input, true otherwise.
//
// NOTE(review): the opening-tag literal was reconstructed from the 11 bytes it
// is compared with; confirm against the upstream source.
bool highlight_syntax_html(struct buffered_stream *bs, FILE *out) {
    // The length is derived from the literal so memcmp can never read past
    // the end of it.
    static const char open_tags[] = "<pre><code>";
    const size_t open_len = sizeof(open_tags) - 1;

    char *buffer;
    size_t available = peek(bs, open_len, &buffer);
    if (!available) {
        return false;
    } else if (available >= open_len && !memcmp(buffer, open_tags, open_len)) {
        // The tags are consumed below, so echo them to keep the document
        // structure intact.
        fputs(open_tags, out);
        drop(bs, open_len);
        while (highlight_syntax_c(bs, out)) {}
    } else {
        putc(*buffer, out);
        drop(bs, 1);
    }
    return true;
}
// Runs the highlighter over everything readable from in, writing the result
// to out. Input is read through a fixed 1024-byte buffer on the stack.
void highlight_syntax(FILE *in, FILE *out) {
    char storage[1024];

    // Zero everything first so available and index start at 0.
    struct buffered_stream bs = {0};
    bs.stream = in;
    bs.buffer = storage;
    bs.capacity = sizeof(storage) / sizeof(storage[0]);

    bool more;
    do {
        more = highlight_syntax_html(&bs, out);
    } while (more);
}
// Entry point: filters standard input to standard output.
int main(void) {
    FILE *source = stdin;
    FILE *sink = stdout;

    highlight_syntax(source, sink);
    return 0;
}