Add function to get unicode category of given rune - rcx - library of miscellaneous bits of C code

commit dfce8a98479c80a98ce8c1017394c7f2d93e4814
parent c88e97c7aac0df9bdb9664c7bb9c1a72cfab277e
Author: robert <robertrussell.72001@gmail.com>
Date:   Wed, 10 Aug 2022 18:56:19 -0700

Add function to get unicode category of given rune

Much better than using ICU or some other awful library.

Diffstat:
M .gitignore  | 2 ++
M Makefile  | 16 +++++++++++++++-
M config.mk  | 2 ++
M inc/cext/all.h  | 1 +
A inc/cext/unicode.h  | 5 +++++
A src/unicode.c  | 36 ++++++++++++++++++++++++++++++++++++
A tool/ucattab.c  | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

7 files changed, 172 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 *.o
 *.a
+gen/*
+tool/ucattab
diff --git a/Makefile b/Makefile
@@ -8,6 +8,7 @@ SRC =\
 	src/log.c\
 	src/opt.c\
 	src/str.c\
+	src/unicode.c\
 	src/utf8.c
 
 libcext.a: $(SRC:.c=.o)
@@ -21,9 +22,22 @@ src/bench.o: src/bench.c inc/cext/bench.h inc/cext/cext.h inc/cext/def.h inc/cex
 src/log.o: src/log.c inc/cext/cext.h inc/cext/def.h inc/cext/log.h config.mk
 src/opt.o: src/opt.c inc/cext/cext.h inc/cext/def.h inc/cext/opt.h config.mk
 src/str.o: src/str.c inc/cext/alloc.h inc/cext/cext.h inc/cext/def.h inc/cext/log.h inc/cext/str.h config.mk
+src/unicode.o: src/unicode.c inc/cext/cext.h inc/cext/def.h gen/ucattab.inc config.mk
 src/utf8.o: src/utf8.c inc/cext/cext.h inc/cext/def.h inc/cext/utf8.h config.mk
 
+# The gen directory is an order-only prerequisite (listed after "|", GNU
+# make syntax): it only has to exist. As a normal prerequisite its mtime,
+# which changes whenever a file is created inside it, would make the
+# targets below look out of date and trigger a fresh download.
+#
+# Both recipes write to a temporary file and rename it into place, so a
+# failed download or generator run never leaves behind a truncated target
+# that later looks up to date.
+gen/ucattab.inc: tool/ucattab gen/UnicodeData.txt | gen
+	tool/ucattab gen/UnicodeData.txt > $@.tmp
+	mv $@.tmp $@
+
+gen/UnicodeData.txt: | gen
+	wget -q -O $@.tmp $(UNICODE_DATA_URL)
+	mv $@.tmp $@
+
+gen:
+	mkdir -p gen
+
+# Generator for gen/ucattab.inc; links against the library objects it uses.
+tool/ucattab: tool/ucattab.c src/alloc.o src/log.o src/str.o
+	$(CC) -o $@ $(CFLAGS) $^
+
 clean:
-	rm -f libcext.a $(SRC:.c=.o)
+	rm -rf libcext.a $(SRC:.c=.o) gen
 
 .PHONY: clean
diff --git a/config.mk b/config.mk
@@ -1,3 +1,5 @@
 CC = cc
 CFLAGS = -std=c11 -O2 -Wall -pedantic -Iinc
 AR = ar
+
+UNICODE_DATA_URL = https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
diff --git a/inc/cext/all.h b/inc/cext/all.h
@@ -5,5 +5,6 @@
 #include "cext/log.h"
 #include "cext/opt.h"
 #include "cext/str.h"
+#include "cext/unicode.h"
 #include "cext/utf8.h"
 #include "cext/vector.h"
diff --git a/inc/cext/unicode.h b/inc/cext/unicode.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "cext/def.h"
+
+/* Return the two-letter Unicode general category of r (e.g. "Lu") as a
+ * NUL-terminated string. The result points into a static table, so the
+ * caller must not free it. A null pointer is returned if r is greater than
+ * RUNE_MAX. */
+char *cext_unicode_category(rune r);
diff --git a/src/unicode.c b/src/unicode.c
@@ -0,0 +1,36 @@
+#include "cext/cext.h"
+#include "../gen/ucattab.inc"
+
+/* Two-letter general category names packed back to back, each followed by
+ * a NUL (the last one by the literal's implicit terminator), so category
+ * number n starts at offset 3*n. The order must match the identical table
+ * in tool/ucattab.c, which assigns the category numbers stored in the
+ * generated lookup tables. */
+static char ucats[] =
+	"Lu\0Ll\0Lt\0Lm\0Lo\0"
+	"Mn\0Mc\0Me\0"
+	"Nd\0Nl\0No\0"
+	"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+	"Sm\0Sc\0Sk\0So\0"
+	"Zs\0Zl\0Zp\0"
+	"Cc\0Cf\0Cs\0Co\0Cn";
+
+/* Look up the general category name of r. Returns a pointer into ucats,
+ * or a null pointer when r is greater than RUNE_MAX. */
+char *
+cext_unicode_category(rune r) {
+	/* Codepoints up to U+00FF are served by a direct-index table. */
+	if (r <= 0xff)
+		return &ucats[3 * ucatl1tab[r]];
+
+	/* Nothing beyond the last valid codepoint has a category. */
+	if (r > RUNE_MAX)
+		return 0;
+
+	/* Each ucattab entry packs a category number (high byte) with the
+	 * first codepoint of a run (low three bytes); the run lasts until the
+	 * next entry begins. Bisect for the run that contains r. */
+	usize lo = 0;
+	usize hi = LEN(ucattab);
+	while (lo < hi) {
+		usize mid = lo + (hi - lo) / 2;
+		u32 cur = ucattab[mid];
+
+		if (r < (cur & 0xffffff)) {
+			/* r lies before this run: continue in the lower half. */
+			hi = mid;
+			continue;
+		}
+
+		/* r is at or after the start of this run. It belongs here if
+		 * this is the final entry or the next run starts beyond r. */
+		if (mid + 1 >= LEN(ucattab) || r < (ucattab[mid+1] & 0xffffff))
+			return &ucats[3 * (cur >> 24)];
+
+		lo = mid + 1;
+	}
+
+	return 0;
+}
diff --git a/tool/ucattab.c b/tool/ucattab.c
@@ -0,0 +1,111 @@
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cext/alloc.h"
+#include "cext/cext.h"
+#include "cext/log.h"
+#include "cext/str.h"
+
+#define NF 15 /* Number of fields in UnicodeData.txt */
+
+/* Translate a two-letter general category name into its position in the
+ * table below (0 for "Lu", 1 for "Ll", ...). An unknown name is reported
+ * through fatalf, which presumably does not return. */
+u8
+cattoi(char *cat) {
+	/* Each name occupies three bytes: two letters plus a NUL. */
+	static char ucats[] =
+		"Lu\0Ll\0Lt\0Lm\0Lo\0"
+		"Mn\0Mc\0Me\0"
+		"Nd\0Nl\0No\0"
+		"Pc\0Pd\0Ps\0Pe\0Pi\0Pf\0Po\0"
+		"Sm\0Sc\0Sk\0So\0"
+		"Zs\0Zl\0Zp\0"
+		"Cc\0Cf\0Cs\0Co\0Cn";
+	usize ncats = LEN(ucats) / 3;
+	for (usize n = 0; n < ncats; n++) {
+		if (strcmp(cat, ucats + 3*n) == 0)
+			return n;
+	}
+	fatalf("bad category '%s'", cat);
+	return 0; /* Suppress warning */
+}
+
+/* Read the UnicodeData.txt file named by filename and return an array,
+ * obtained from ealloc, of RUNE_MAX + 1 category numbers (as produced by
+ * cattoi) indexed by codepoint. Codepoints the file does not list, and
+ * everything after its last entry, get the number for "Cn". I/O failures
+ * and malformed lines are reported through fatalf. */
+u8 *
+parse_cats(char *filename) {
+	FILE *f = fopen(filename, "rb");
+	if (!f) fatalf("fopen: %s", strerror(errno));
+
+	u8 *cats = ealloc(RUNE_MAX + 1);
+
+	/* Looked up once instead of once per unassigned codepoint. */
+	u8 unassigned = cattoi("Cn");
+
+	char line[512];
+	bool inrange = false;
+	i32 prevcp = -1;
+	while (fgets(line, sizeof line, f)) {
+		char *nl = strchr(line, '\n');
+		if (!nl) fatalf("line too long");
+		*nl = '\0';
+
+		char **fields = (char *[NF]){0};
+		if (cext_str_split(&fields, line, ";", NF) != NF)
+			fatalf("line has too few fields");
+
+		/* The codepoint is used as an index into cats below, so reject
+		 * anything that is not a complete hex number in 0..RUNE_MAX
+		 * rather than writing outside the array. */
+		char *end;
+		long v = strtol(fields[0], &end, 16);
+		if (end == fields[0] || *end != '\0' || v < 0 || v > (long)RUNE_MAX)
+			fatalf("bad codepoint '%s'", fields[0]);
+		i32 cp = v;
+		char *name = fields[1];
+		u8 cat = cattoi(!strcmp(fields[2], "") ? "Cn" : fields[2]);
+
+		/* We expect UnicodeData.txt to be sorted, but I can't find any
+		 * guarantee of that in UAX #44. */
+		assert(cp > prevcp);
+		assert(!inrange || cext_str_ends_with(name, "Last>"));
+
+		/* Fill the gap since the previous entry: with this entry's
+		 * category if the previous line opened a "First>" range,
+		 * otherwise as unassigned. The entry itself always gets cat. */
+		for (i32 c = prevcp+1; c <= cp; c++)
+			cats[c] = inrange || c == cp ? cat : unassigned;
+
+		inrange = cext_str_ends_with(name, "First>");
+		prevcp = cp;
+	}
+	if (!feof(f)) fatalf("fgets: %s", strerror(errno));
+
+	/* Everything after the last listed codepoint is unassigned. */
+	for (i32 c = prevcp+1; c <= RUNE_MAX; c++)
+		cats[c] = unassigned;
+
+	fclose(f);
+
+	return cats;
+}
+
+/* Write the category lookup tables as C source to stdout, built from the
+ * UnicodeData.txt file named by the single command-line argument: a
+ * 256-entry table for codepoints up to U+00FF and a run-length compressed
+ * table for everything from U+0100 up to RUNE_MAX. */
+int
+main(int argc, char **argv) {
+	if (argc != 2)
+		fatalf("usage: %s UNICODE_DATA_FILE", argv[0]);
+
+	u8 *cats = parse_cats(argv[1]);
+
+	printf(
+		"/* This file is automatically generated. */\n"
+		"\n"
+		"/* Special table to optimize for Latin 1 */\n"
+		"static u8 ucatl1tab[] = {"
+	);
+
+	/* Eight entries per output line. */
+	for (rune i = 0; i <= 0xff; i++)
+		printf("%s0x%02x,", i%8 == 0 ? "\n\t" : " ", cats[i]);
+
+	printf(
+		"\n}; /* 256 bytes */\n"
+		"\n"
+		"/* The high byte is the category, and the low three bytes are the\n"
+		" * codepoint. Codepoints are listed in order, with consecutive\n"
+		" * entries with the same category compressed into one entry. */\n"
+		"static u32 ucattab[] = {"
+	);
+
+	/* Emit an entry only where the category differs from that of the
+	 * previous codepoint; U+0100 always starts the first run. */
+	usize tablen = 0;
+	for (rune i = 0x100; i <= RUNE_MAX; i++) {
+		if (i > 0x100 && cats[i] == cats[i-1])
+			continue;
+		printf("%s0x%08"PRIx32"ul,", tablen%4 == 0 ? "\n\t" : " ",
+				((rune) cats[i] << 24) | i);
+		tablen++;
+	}
+
+	/* tablen is a usize (presumably size_t), which is unsigned, so the
+	 * matching conversion is %zu rather than %zd. */
+	printf("\n}; /* %zu bytes */\n", 4 * tablen);
+
+	/* stdout is normally redirected into a file that is compiled into
+	 * the library, so a failed write must not pass silently. */
+	if (fflush(stdout) == EOF || ferror(stdout))
+		fatalf("error writing to stdout");
+
+	return 0;
+}

	rcx library of miscellaneous bits of C code
	git clone git://git.rr3.xyz/rcx
	Log \| Files \| Refs \| README \| LICENSE

M	.gitignore	\|	2	++
M	Makefile	\|	16	+++++++++++++++-
M	config.mk	\|	2	++
M	inc/cext/all.h	\|	1	+
A	inc/cext/unicode.h	\|	5	+++++
A	src/unicode.c	\|	36	++++++++++++++++++++++++++++++++++++
A	tool/ucattab.c	\|	111	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++