about summary refs log tree commit diff
path: root/src/rt/sundown/html/houdini_href_e.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/rt/sundown/html/houdini_href_e.c')
-rw-r--r--src/rt/sundown/html/houdini_href_e.c108
1 files changed, 108 insertions, 0 deletions
diff --git a/src/rt/sundown/html/houdini_href_e.c b/src/rt/sundown/html/houdini_href_e.c
new file mode 100644
index 00000000000..981b3b17e41
--- /dev/null
+++ b/src/rt/sundown/html/houdini_href_e.c
@@ -0,0 +1,108 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "houdini.h"
+
+#define ESCAPE_GROW_FACTOR(x) (((x) * 12) / 10)
+
+/*
+ * The following characters will not be escaped:
+ *
+ *		-_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ *	- The characters which are safe to be in a URL
+ *	- The characters which are *not* safe to be in
+ *	a URL because they are RESERVED characters.
+ *
+ * We assume (lazily) that any RESERVED char that
+ * appears inside a URL is actually meant to
+ * have its native function (i.e. as a URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the characters & (amp)
+ * and ' (single quote) are not marked as safe in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const char HREF_SAFE[] = { /* indexed by byte value; non-zero = copied through unescaped */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x0F: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1F: all 0 */
+	0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F: space, double quote, '&' and single quote are 0 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /* 0x30-0x3F: digits and : ; = ? are 1; '<' and '>' are 0 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F: '@' and A-O, all 1 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x50-0x5F: P-Z and '_' are 1; 0x5B-0x5E are 0 */
+	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F: backtick is 0; a-o are 1 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70-0x7F: p-z are 1; 0x7B-0x7F are 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0x8F: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90-0x9F: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0-0xAF: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0-0xBF: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0-0xCF: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0-0xDF: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0-0xEF: all 0 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0-0xFF: all 0 */
+};
+
+/*
+ * houdini_escape_href: escape a URL so it can be used as an HTML href.
+ *
+ *	ob   - buffer that receives the output (via bufput/BUFPUTSL)
+ *	src  - input bytes; positions 0 .. size - 1 are read
+ *	size - number of input bytes
+ *
+ * Each input byte is handled in one of three ways:
+ *
+ *	- bytes flagged in HREF_SAFE are passed through unchanged;
+ *	- '&' is written as "&amp;" and '\'' as "&#x27;";
+ *	- any other byte is written as %XX with uppercase hex digits.
+ *
+ * A space therefore comes out as %20 rather than '+'; the plus form
+ * is more commonly seen when building GET strings.
+ */
+void
+houdini_escape_href(struct buf *ob, const uint8_t *src, size_t size)
+{
+	static const char digits[] = "0123456789ABCDEF";
+	/* pct[0] stays '%'; the two hex digits are filled in per byte */
+	char pct[3] = { '%', 0, 0 };
+	size_t pos, run_start;
+
+	/* bufgrow() gets 12/10 of the input length (ESCAPE_GROW_FACTOR) */
+	bufgrow(ob, ESCAPE_GROW_FACTOR(size));
+
+	for (pos = 0; pos < size; pos++) {
+		/* find the longest run of safe bytes starting here ... */
+		run_start = pos;
+		while (pos < size && HREF_SAFE[src[pos]])
+			pos++;
+
+		/* ... and emit it with a single bufput() */
+		if (pos != run_start)
+			bufput(ob, src + run_start, pos - run_start);
+
+		/* input exhausted: nothing left to escape */
+		if (pos == size)
+			return;
+
+		/* src[pos] is now the first byte that is not in HREF_SAFE */
+		if (src[pos] == '&') {
+			/* valid in a URL, but must be an entity inside HTML */
+			BUFPUTSL(ob, "&amp;");
+		} else if (src[pos] == '\'') {
+			/* likewise valid in a URL; entity-escaped for HTML */
+			BUFPUTSL(ob, "&#x27;");
+		} else {
+			/* generic %XX escape, high nibble first */
+			pct[1] = digits[src[pos] >> 4];
+			pct[2] = digits[src[pos] & 0xF];
+			bufput(ob, pct, 3);
+		}
+	}
+}