snac2/xs_regex.h

/* copyright (c) 2022 - 2024 grunfink et al. / MIT license */

#ifndef _XS_REGEX_H

#define _XS_REGEX_H

/* returns non-zero if str matches the regex rx at least once */
int xs_regex_match(const char *str, const char *rx);
/* splits str using rx as the separator, at most count times; the returned
   list alternates the text chunks with the separators that were matched
   (an empty list means rx could not be compiled) */
xs_list *xs_regex_split_n(const char *str, const char *rx, int count);
/* same as xs_regex_split_n() with XS_ALL as the count (XS_ALL is defined
   elsewhere; presumably it means 'no limit') */
#define xs_regex_split(str, rx) xs_regex_split_n(str, rx, XS_ALL)
/* returns a list with only the matched texts, considering at most count matches */
xs_list *xs_regex_select_n(const char *str, const char *rx, int count);
/* same as xs_regex_select_n() with XS_ALL as the count */
#define xs_regex_select(str, rx) xs_regex_select_n(str, rx, XS_ALL)
/* replaces at most count matches of rx in str with rep (an unescaped & in rep
   stands for the matched text). str is freed and a newly built string is returned.
   NOTE(review): declared as returning xs_list *, but the implementation builds
   and returns an xs_str; confirm whether the return type should be xs_str * */
xs_list *xs_regex_replace_in(xs_str *str, const char *rx, const char *rep, int count);
/* 'in place' replace of every match: str itself is passed in and freed, so it
   should be reassigned with the result */
#define xs_regex_replace_i(str, rx, rep) xs_regex_replace_in(str, rx, rep, XS_ALL)
/* these two hand xs_dup(str) to xs_regex_replace_in(), so it is that value
   (presumably a copy of str) that gets freed and not the caller's str */
#define xs_regex_replace_n(str, rx, rep, count) xs_regex_replace_in(xs_dup(str), rx, rep, count)
#define xs_regex_replace(str, rx, rep) xs_regex_replace_in(xs_dup(str), rx, rep, XS_ALL)

#ifdef XS_IMPLEMENTATION

#ifdef __TINYC__
/* fix a compilation error in tcc */
#define _REGEX_NELTS(n)
#endif

#include <regex.h>

xs_list *xs_regex_split_n(const char *str, const char *rx, int count)
/* splits str using the extended regex rx as a separator, at most count times.
    Always returns a list:
    len == 0: regcomp error
    len == 1: full string (no matches, or count <= 0)
    len == odd: first part [ separator / next part ]...
*/
{
    regex_t re;
    regmatch_t rm;
    int offset = 0;
    xs_list *list = xs_list_new();

    if (regcomp(&re, rx, REG_EXTENDED))
        return list;

    while (count > 0) {
        /* start of the still unprocessed part of the string */
        const char *p = str + offset;

        /* once past the first chunk, p is no longer a beginning of line */
        if (regexec(&re, p, 1, &rm, offset > 0 ? REG_NOTBOL : 0))
            break;

        /* add first the leading part of the string */
        xs *s1 = xs_str_new_sz(p, rm.rm_so);
        list = xs_list_append(list, s1);

        /* add now the matched text as the separator */
        xs *s2 = xs_str_new_sz(p + rm.rm_so, rm.rm_eo - rm.rm_so);
        list = xs_list_append(list, s2);

        /* move forward */
        offset += rm.rm_eo;

        count--;

        /* an empty match at the very start of p consumed nothing: searching
           again would find the same match over and over, so stop here */
        if (rm.rm_eo == 0)
            break;
    }

    /* add the rest of the string; it is taken from offset (and not from the
       last searched position) so that it is also correct when the loop
       finished because count ran out, or when it never ran at all */
    list = xs_list_append(list, str + offset);

    regfree(&re);

    return list;
}


xs_list *xs_regex_select_n(const char *str, const char *rx, int count)
/* returns a list with the text of the matches of rx found in str,
   considering at most count of them (empty if there are none) */
{
    /* the split list interleaves plain text and separators, and the
       separators are precisely the matches being looked for */
    xs *parts = xs_regex_split_n(str, rx, count);
    xs_list *matches = xs_list_new();
    const xs_val *item;
    int ctx = 0;

    for (int idx = 0; xs_list_next(parts, &item, &ctx); idx++) {
        /* separators live in the odd positions */
        if (idx % 2 == 1)
            matches = xs_list_append(matches, item);
    }

    return matches;
}


xs_list *xs_regex_replace_in(xs_str *str, const char *rx, const char *rep, int count)
/* builds a new string from str where the matches of rx (at most count)
   are substituted by rep. If rep contains any &, each unescaped & expands
   to the matched text and a backslash makes the next char literal.
   str is freed; the new string is returned */
{
    xs *parts = xs_regex_split_n(str, rx, count);
    xs_str *out = xs_str_new(NULL);
    const int has_amp = strchr(rep, '&') != NULL;
    const xs_val *item;
    int ctx = 0;

    for (int idx = 0; xs_list_next(parts, &item, &ctx); idx++) {
        if (idx % 2 == 0) {
            /* even positions: text between matches, copied verbatim */
            out = xs_str_cat(out, item);
            continue;
        }

        if (!has_amp) {
            /* no placeholder at all: rep goes in untouched */
            out = xs_str_cat(out, rep);
            continue;
        }

        /* rep has a placeholder: walk it one char at a time */
        for (const char *r = rep; *r; r++) {
            if (*r == '&') {
                /* item is the text that matched */
                out = xs_str_cat(out, item);
                continue;
            }

            /* skip the backslash; a trailing one ends the replacement */
            if (*r == '\\' && *++r == '\0')
                break;

            out = xs_append_m(out, r, 1);
        }
    }

    xs_free(str);

    return out;
}


int xs_regex_match(const char *str, const char *rx)
/* returns 1 if rx is found somewhere in str, 0 otherwise
   (also 0 when nothing is selected because rx could not be compiled) */
{
    /* one hit is enough to answer, so no more than one is requested */
    xs *found = xs_regex_select_n(str, rx, 1);

    return xs_list_len(found) == 1;
}


#endif /* XS_IMPLEMENTATION */

#endif /* _XS_REGEX_H */