/**
 * Reads an HTML file, identifies blocks of C code contained within
 * `<pre><code>` tags, and applies syntax highlighting in the form of `<span>`
 * tags to those blocks of C code.
 *
 * Any pre-existing syntax highlighting is stripped from the C code before
 * applying new syntax highlighting so that the process is idempotent. This is
 * done for a few reasons:
 *
 * 1) Only one HTML file is required, rather than needing both an unprocessed
 * HTML and a post-processed HTML file.
 *
 * 2) That single HTML file can be served in the future as-is, even if this tool
 * were to disappear or break.
 *
 * 3) If the tool is updated in the future to apply different syntax
 * highlighting, that single HTML file can simply be run through the tool once
 * more in order to update it.
 */

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// A read-only stream wrapped with a caller-supplied look-ahead buffer, so that
// upcoming bytes can be inspected (peek) before they are consumed (drop).
struct buffered_stream {
    FILE *stream;      // underlying input; the buffer is refilled from it
    char *buffer;      // caller-owned storage for buffered bytes
    size_t capacity;   // total number of bytes that fit in buffer
    size_t available;  // number of unread bytes currently buffered
    size_t index;      // offset within buffer of the first unread byte
};

// Returns the smaller of the two sizes (y when they are equal).
size_t min(size_t x, size_t y) {
    if (x < y) {
        return x;
    }

    return y;
}

// Exposes the next unread bytes of the stream without consuming them.
//
// At most bs->capacity bytes can be requested at once; larger requests are
// clamped. On return *result points at the first unread byte inside
// bs->buffer; a later peek or drop may move the buffered data, so the pointer
// should be used before the next call.
//
// Returns the actual number of bytes available (may be less than len).
size_t peek(struct buffered_stream *bs, size_t len, char **result) {
    size_t wanted = min(len, bs->capacity);

    if (wanted > bs->available) {
        // Too little is buffered: slide the unread bytes to the front of the
        // buffer and top it up from the stream.
        memmove(bs->buffer, bs->buffer + bs->index, bs->available);
        bs->index = 0;

        size_t space = bs->capacity - bs->available;
        size_t got = fread(bs->buffer + bs->available, sizeof(*bs->buffer),
                           space, bs->stream);
        bs->available += got;
    }

    *result = bs->buffer + bs->index;
    return min(wanted, bs->available);
}

// Consumes the next unread bytes of the stream, refilling the buffer first
// when fewer than the requested number are buffered. Requests larger than
// bs->capacity are clamped.
//
// Returns the actual number of bytes dropped (may be less than len).
size_t drop(struct buffered_stream *bs, size_t len) {
    size_t wanted = min(len, bs->capacity);

    if (wanted > bs->available) {
        // Slide the unread bytes to the front and top up from the stream.
        memmove(bs->buffer, bs->buffer + bs->index, bs->available);
        bs->index = 0;

        size_t space = bs->capacity - bs->available;
        size_t got = fread(bs->buffer + bs->available, sizeof(*bs->buffer),
                           space, bs->stream);
        bs->available += got;
    }

    size_t consumed = min(wanted, bs->available);
    bs->available -= consumed;
    bs->index += consumed;
    return consumed;
}

// Discards input up to and including the next '>' character (or until the
// input is exhausted). Nothing is written to out.
//
// TODO: Stripping of pre-existing syntax highlighting may be better done as a
// layer in the peek/drop functions themselves, in order to simplify multiple
// other functions which need to consider pre-existing syntax highlighting
void handle_pre_existing_syntax(struct buffered_stream *bs, FILE *out) {
    (void)out;  // unused by this handler

    char *cursor;
    bool tag_closed = false;

    while (!tag_closed && peek(bs, 1, &cursor)) {
        // Read the byte before dropping it; '>' is consumed as well.
        tag_closed = (*cursor == '>');
        drop(bs, 1);
    }
}

// Emits a single-line comment wrapped in a highlighting span. The caller has
// already consumed the leading "//"; this function consumes the rest of the
// line, including the terminating newline (or stops at end of input).
//
// NOTE(review): the markup literals in this function were missing from the
// file as received (empty strings, including a 7-byte memcmp against "").
// "</span>" is fixed by the comparison length; the class name "comment" is a
// placeholder - confirm it against the stylesheet.
//
// TODO: For multi-line comments made up of multiple single-line comments, use a
// single span block rather than multiple
void handle_single_line_comment(struct buffered_stream *bs, FILE *out) {
    fputs("<span class=\"comment\">//", out);

    while (true) {
        char *buffer;
        size_t available = peek(bs, 7, &buffer);

        if (!available) {
            // End of input inside the comment: still close the span.
            fputs("</span>", out);
            break;
        } else if (available >= 7 && !memcmp(buffer, "</span>", 7)) {
            // Normally stripping of any previously existing syntax would occur
            // in the parent function, but since single-line comments swallow
            // until newline, handling needs to be duplicated here
            drop(bs, 7);
        } else if (*buffer == '\n') {
            // Close the span before the newline so it never spans lines.
            fputs("</span>\n", out);
            drop(bs, 1);
            break;
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}

// Emits a block comment wrapped in a highlighting span. The caller has already
// consumed the opening "/*"; this function copies input through the closing
// "*/" (or to end of input) and then closes the span.
//
// NOTE(review): the span markup literals were missing from the file as
// received (empty strings). The class name "comment" is a placeholder -
// confirm it against the stylesheet.
void handle_multi_line_comment(struct buffered_stream *bs, FILE *out) {
    fputs("<span class=\"comment\">/*", out);

    while (true) {
        char *buffer;
        size_t available = peek(bs, 2, &buffer);

        if (!available) {
            // Unterminated comment: still close the span.
            fputs("</span>", out);
            break;
        } else if (available >= 2 && !memcmp(buffer, "*/", 2)) {
            fputs("*/</span>", out);
            drop(bs, 2);
            break;
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}

// Emits a string literal wrapped in a highlighting span. The caller has
// already consumed the opening '"'; this function copies input through the
// closing '"' (or to end of input) and then closes the span.
//
// NOTE(review): the span markup literals were missing from the file as
// received (empty strings). The class name "string" is a placeholder -
// confirm it against the stylesheet.
void handle_string_literal(struct buffered_stream *bs, FILE *out) {
    fputs("<span class=\"string\">\"", out);

    while (true) {
        char *buffer;
        size_t available = peek(bs, 2, &buffer);

        if (!available) {
            // Unterminated literal: still close the span.
            fputs("</span>", out);
            break;
        } else if (*buffer == '"') {
            fputs("\"</span>", out);
            drop(bs, 1);
            break;
        } else if (*buffer == '\\' && available >= 2) {
            // A backslash always escapes the byte after it. Copying the pair
            // as a unit keeps "\\" from being misread as a backslash followed
            // by an escaped quote, which would run past the end of the string.
            fwrite(buffer, sizeof(*buffer), 2, out);
            drop(bs, 2);
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}

// Emits a character literal wrapped in a highlighting span. The caller has
// already consumed the opening quote; this function copies input through the
// closing quote (or to end of input) and then closes the span.
//
// NOTE(review): the span markup literals were missing from the file as
// received (empty strings). The class name "character" is a placeholder -
// confirm it against the stylesheet.
void handle_character_literal(struct buffered_stream *bs, FILE *out) {
    fputs("<span class=\"character\">'", out);

    while (true) {
        char *buffer;
        size_t available = peek(bs, 2, &buffer);

        if (!available) {
            // Unterminated literal: still close the span.
            fputs("</span>", out);
            break;
        } else if (*buffer == '\'') {
            fputs("'</span>", out);
            drop(bs, 1);
            break;
        } else if (*buffer == '\\' && available >= 2) {
            // A backslash always escapes the byte after it. Copying the pair
            // as a unit keeps '\\' from being misread as a backslash followed
            // by an escaped quote, which would run past the closing quote.
            fwrite(buffer, sizeof(*buffer), 2, out);
            drop(bs, 2);
        } else {
            putc(*buffer, out);
            drop(bs, 1);
        }
    }
}

// Reports whether the len bytes at name spell exactly one of the type or
// qualifier names that receive type highlighting.
static bool is_type_name(const char *name, size_t len) {
    static const char *const type_names[] = {
        "void",     "const",    "static",   "signed",    "unsigned",
        "bool",     "char",     "short",    "int",       "long",
        "float",    "double",   "uint8_t",  "uint16_t",  "uint32_t",
        "uint64_t", "uint128_t", "int8_t",  "int16_t",   "int32_t",
        "int64_t",  "int128_t", "size_t",   "FILE",      "va_list",
    };

    for (size_t i = 0; i < sizeof(type_names) / sizeof(*type_names); i++) {
        // Exact match only: the lengths must agree, so that e.g. "integer"
        // or "FILENAME" are not mistaken for "int" or "FILE".
        if (strlen(type_names[i]) == len && !memcmp(name, type_names[i], len)) {
            return true;
        }
    }

    return false;
}

// Emits the identifier at the head of the stream and consumes it. The
// identifier is wrapped in a span when it is immediately followed by '('
// (treated as a function call) or when it is one of the known type names;
// otherwise it is copied through unchanged. The caller guarantees that the
// first byte is a valid identifier start.
//
// NOTE(review): the span markup literals were missing from the file as
// received (empty strings, and an 8-byte memcmp against "("). "</span>(" is
// fixed by the comparison length; the class names "function" and "type" are
// placeholders - confirm them against the stylesheet.
void handle_identifier(struct buffered_stream *bs, FILE *out) {
    bool is_function_call = false;
    char *buffer;
    size_t len = 1;

    while (true) {
        // Look far enough ahead to see a "</span>(" left behind by a previous
        // highlighting pass directly after the identifier.
        size_t available = peek(bs, len + 8, &buffer);

        if (available < len + 1) {
            break;
        }

        // <ctype.h> functions require a value representable as unsigned char;
        // a plain char may be negative for bytes >= 0x80 (e.g. UTF-8).
        unsigned char next = (unsigned char)buffer[len];

        if (isalnum(next) || next == '_') {
            len += 1;
            continue;
        }

        // Normally stripping of any previously existing syntax would occur
        // in the parent function, but since function calls don't include
        // the following parentheses, handling needs to be duplicated here
        is_function_call =
            next == '(' ||
            ((available - len) >= 8 && !memcmp(buffer + len, "</span>(", 8));
        break;
    }

    const char *css_class = NULL;

    if (is_function_call) {
        css_class = "function";
    } else if (is_type_name(buffer, len)) {
        css_class = "type";
    }

    if (css_class) {
        fprintf(out, "<span class=\"%s\">", css_class);
    }

    fwrite(buffer, sizeof(*buffer), len, out);

    if (css_class) {
        fputs("</span>", out);
    }

    drop(bs, len);
}

// Emits a numeric literal wrapped in a highlighting span. Consumes the run of
// alphanumeric characters and '.' at the head of the stream (which covers
// forms such as 0x1F, 1.5f and 10UL) and stops at the first other byte or at
// end of input.
//
// NOTE(review): the span markup literals were missing from the file as
// received (empty strings). The class name "number" is a placeholder -
// confirm it against the stylesheet.
void handle_number_literal(struct buffered_stream *bs, FILE *out) {
    fputs("<span class=\"number\">", out);

    while (true) {
        char *buffer;

        if (!peek(bs, 1, &buffer)) {
            break;
        }

        // <ctype.h> functions require a value representable as unsigned char;
        // a plain char may be negative for bytes >= 0x80.
        unsigned char c = (unsigned char)*buffer;

        if (!isalnum(c) && c != '.') {
            break;
        }

        putc(*buffer, out);
        drop(bs, 1);
    }

    // Every exit path closes the span exactly once.
    fputs("</span>", out);
}

// Highlights C syntax contained within <pre><code> HTML blocks, until the
// corresponding ending </code></pre> tags are found. Ignores existing syntax
// highlighting so that this operation is idempotent.
//
// Processes one token (or one byte) per call. Returns false at end of input
// or when the stream is positioned at "</code></pre>", which is left
// unconsumed for the caller; returns true otherwise.
//
// NOTE(review): the tag and entity literals in this function were missing or
// decoded in the file as received, and the "<span" branch was fused with the
// "</span>" branch into one non-compiling line. They are reconstructed here
// from the surviving comparison lengths (13, 5, 7, 4, 5).
bool highlight_syntax_c(struct buffered_stream *bs, FILE *out) {
    char *buffer;
    size_t available = peek(bs, 13, &buffer);

    if (!available) {
        return false;
    } else if (available >= 13 && !memcmp(buffer, "</code></pre>", 13)) {
        return false;
    } else if (available >= 5 && !memcmp(buffer, "<span", 5)) {
        // Opening tag from a previous highlighting pass: discard through '>'.
        handle_pre_existing_syntax(bs, out);
    } else if (available >= 7 && !memcmp(buffer, "</span>", 7)) {
        // Closing tag from a previous highlighting pass.
        drop(bs, 7);
    } else if (available >= 2 && !memcmp(buffer, "//", 2)) {
        drop(bs, 2);
        handle_single_line_comment(bs, out);
    } else if (available >= 2 && !memcmp(buffer, "/*", 2)) {
        drop(bs, 2);
        handle_multi_line_comment(bs, out);
    } else if (*buffer == '"') {
        drop(bs, 1);
        handle_string_literal(bs, out);
    } else if (*buffer == '\'') {
        drop(bs, 1);
        handle_character_literal(bs, out);
    } else if (isalpha((unsigned char)*buffer) || *buffer == '_') {
        // The cast keeps <ctype.h> calls defined for bytes >= 0x80.
        handle_identifier(bs, out);
    } else if (isdigit((unsigned char)*buffer)) {
        handle_number_literal(bs, out);
    } else if (available >= 4 && !memcmp(buffer, "&lt;", 4)) {
        // Only '<' and '&' are escaped, as described here:
        // https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/pre#escaping_ambiguous_characters
        // Already-escaped input is passed through so it is not escaped twice.
        fputs("&lt;", out);
        drop(bs, 4);
    } else if (available >= 5 && !memcmp(buffer, "&amp;", 5)) {
        fputs("&amp;", out);
        drop(bs, 5);
    } else if (*buffer == '<') {
        // A raw '<' that is not one of the tags recognised above.
        fputs("&lt;", out);
        drop(bs, 1);
    } else if (*buffer == '&') {
        // A raw '&' that does not start one of the entities handled above.
        fputs("&amp;", out);
        drop(bs, 1);
    } else {
        putc(*buffer, out);
        drop(bs, 1);
    }

    return true;
}

// Parses the HTML document to find <pre><code> tags, and then highlights C
// syntax contained within those tags. Nothing in the rest of the HTML document
// should be modified.
//
// Processes one byte, or one whole <pre><code> block, per call. Returns false
// once the input is exhausted and true otherwise.
//
// NOTE(review): the "<pre><code>" literals were missing from the file as
// received (an 11-byte memcmp against ""); they are reconstructed here from
// the comparison and drop length.
bool highlight_syntax_html(struct buffered_stream *bs, FILE *out) {
    char *buffer;
    size_t available = peek(bs, 11, &buffer);

    if (!available) {
        return false;
    } else if (available >= 11 && !memcmp(buffer, "<pre><code>", 11)) {
        // Re-emit the opening tags verbatim, then highlight until the closing
        // tags or end of input. The closing tags are left in the stream and
        // are copied through byte by byte on subsequent calls.
        fputs("<pre><code>", out);
        drop(bs, 11);
        while (highlight_syntax_c(bs, out)) {}
    } else {
        putc(*buffer, out);
        drop(bs, 1);
    }

    return true;
}

// Copies the HTML document read from in to out, applying C syntax highlighting
// along the way. Input is read through a fixed 1024-byte look-ahead buffer
// that lives on this function's stack.
void highlight_syntax(FILE *in, FILE *out) {
    char storage[1024];

    struct buffered_stream bs = {0};
    bs.stream = in;
    bs.buffer = storage;
    bs.capacity = sizeof(storage) / sizeof(storage[0]);

    // Each call handles the next piece of the document; false means done.
    bool more;
    do {
        more = highlight_syntax_html(&bs, out);
    } while (more);
}

// Filters standard input to standard output. Returns 0 on success and 1 if
// reading the input or writing the output failed, so that callers (e.g. a
// script that replaces the original file with the output) can detect a
// truncated result.
int main(void) {
    highlight_syntax(stdin, stdout);

    if (ferror(stdin)) {
        fputs("error: failed to read input\n", stderr);
        return 1;
    }

    // Flush explicitly so that write errors surface before the exit status
    // is decided.
    if (fflush(stdout) == EOF || ferror(stdout)) {
        fputs("error: failed to write output\n", stderr);
        return 1;
    }

    return 0;
}