Mercurial > dillo_port1.3
view src/misc.c @ 2146:aac29835ca8c
Added an optional label to dillorc's search_url. Format: "[<label> ]<url>"
author | Jorge Arellano Cid <jcid@dillo.org> |
---|---|
date | Sun, 24 Jul 2011 13:47:25 -0400 |
parents | e44231f9c6ca |
children |
line wrap: on
line source
/* * File: misc.c * * Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include <assert.h> #include "utf8.hh" #include "msg.h" #include "misc.h" /* * Escape characters as %XX sequences. * Return value: New string. */ char *a_Misc_escape_chars(const char *str, const char *esc_set) { static const char *const hex = "0123456789ABCDEF"; char *p = NULL; Dstr *dstr; int i; dstr = dStr_sized_new(64); for (i = 0; str[i]; ++i) { if (str[i] <= 0x1F || str[i] == 0x7F || strchr(esc_set, str[i])) { dStr_append_c(dstr, '%'); dStr_append_c(dstr, hex[(str[i] >> 4) & 15]); dStr_append_c(dstr, hex[str[i] & 15]); } else { dStr_append_c(dstr, str[i]); } } p = dstr->str; dStr_free(dstr, FALSE); return p; } #define TAB_SIZE 8 /* * Takes a string and converts any tabs to spaces. */ int a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen) { int j, pos = 0, written = 0, old_pos, char_len; uint_t code; static const int combining_char_space = 32; while (*start < end && written < buflen - TAB_SIZE - combining_char_space) { code = a_Utf8_decode(*start, end, &char_len); if (code == '\t') { /* Fill with whitespaces until the next tab. */ old_pos = pos; pos += TAB_SIZE - (pos % TAB_SIZE); for (j = old_pos; j < pos; j++) buf[written++] = ' '; } else { assert(char_len <= 4); for (j = 0; j < char_len; j++) buf[written++] = (*start)[j]; pos++; } *start += char_len; } /* If following chars are combining chars (e.g. accents) add them to the * buffer. We have reserved combining_char_space bytes for this. * If there should be more combining chars, we split nevertheless. */ while (*start < end && written < buflen - 4) { code = a_Utf8_decode(*start, end, &char_len); if (! a_Utf8_combining_char(code)) break; assert(char_len <= 4); for (j = 0; j < char_len; j++) buf[written++] = (*start)[j]; *start += char_len; } return written; } /* TODO: could use dStr ADT! */ typedef struct ContentType_ { const char *str; int len; } ContentType_t; static const ContentType_t MimeTypes[] = { { "application/octet-stream", 24 }, { "application/xhtml+xml", 21 }, { "text/html", 9 }, { "text/plain", 10 }, { "image/gif", 9 }, { "image/png", 9 }, { "image/jpeg", 10 }, { NULL, 0 } }; typedef enum { DT_OCTET_STREAM = 0, DT_TEXT_HTML, DT_TEXT_PLAIN, DT_IMAGE_GIF, DT_IMAGE_PNG, DT_IMAGE_JPG, } DetectedContentType; /* * Detects 'Content-Type' from a data stream sample. * * It uses the magic(5) logic from file(1). Currently, it * only checks the few mime types that Dillo supports. * * 'Data' is a pointer to the first bytes of the raw data. * * Return value: (0 on success, 1 on doubt, 2 on lack of data). */ int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT) { size_t i, non_ascci, non_ascci_text, bin_chars; char *p = Data; int st = 1; /* default to "doubt' */ DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */ /* HTML try */ for (i = 0; i < Size && dIsspace(p[i]); ++i); if ((Size - i >= 5 && !dStrncasecmp(p+i, "<html", 5)) || (Size - i >= 5 && !dStrncasecmp(p+i, "<head", 5)) || (Size - i >= 6 && !dStrncasecmp(p+i, "<title", 6)) || (Size - i >= 14 && !dStrncasecmp(p+i, "<!doctype html", 14)) || /* this line is workaround for FTP through the Squid proxy */ (Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) { Type = DT_TEXT_HTML; st = 0; /* Images */ } else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) { Type = DT_IMAGE_GIF; st = 0; } else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) { Type = DT_IMAGE_PNG; st = 0; } else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) { /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking * at the character representation should be machine independent. */ Type = DT_IMAGE_JPG; st = 0; /* Text */ } else { /* Heuristic for "text/plain" * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251} * All in the above set regard [00-31] as control characters. * LATIN1: [7F-9F] unused * CP-1251 {7F,98} unused (two characters). * * We'll use [0-31] as indicators of non-text content. * Better heuristics are welcomed! :-) */ non_ascci = non_ascci_text = bin_chars = 0; Size = MIN (Size, 256); for (i = 0; i < Size; i++) { int ch = (uchar_t) p[i]; if (ch < 32 && !dIsspace(ch)) ++bin_chars; if (ch > 126) ++non_ascci; if (ch > 190) ++non_ascci_text; } if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) { /* Let's say text: if "rare" chars are <= 10% */ Type = DT_TEXT_PLAIN; } else if (Size > 0) { /* a special check for UTF-8 */ Size = a_Utf8_end_of_char(p, Size - 1) + 1; if (a_Utf8_test(p, Size) > 0) Type = DT_TEXT_PLAIN; } if (Size >= 256) st = 0; } *PT = MimeTypes[Type].str; return st; } /* * Parse Content-Type string, e.g., "text/html; charset=utf-8". * Content-Type is defined in RFC 2045 section 5.1. */ void a_Misc_parse_content_type(const char *type, char **major, char **minor, char **charset) { static const char tspecials_space[] = "()<>@,;:\\\"/[]?= "; const char *str, *s; if (major) *major = NULL; if (minor) *minor = NULL; if (charset) *charset = NULL; if (!(str = type)) return; for (s = str; *s && !iscntrl((uchar_t)*s) && !strchr(tspecials_space, *s); s++) ; if (major) *major = dStrndup(str, s - str); if (*s == '/') { for (str = ++s; *s && !iscntrl((uchar_t)*s) && !strchr(tspecials_space, *s); s++) ; if (minor) *minor = dStrndup(str, s - str); } if (charset && *s && (dStrncasecmp(type, "text/", 5) == 0 || dStrncasecmp(type, "application/xhtml+xml", 21) == 0)) { /* "charset" parameter defined for text media type in RFC 2046, * application/xhtml+xml in RFC 3236. * * Note that RFC 3023 lists some main xml media types and provides * the convention of using the "+xml" minor type suffix for other * xml types, so it would be reasonable to check for that suffix if * we have need to care about various xml types someday. */ const char terminators[] = " ;\t"; const char key[] = "charset"; if ((s = dStristr(str, key)) && (s == str || strchr(terminators, s[-1]))) { s += sizeof(key) - 1; for ( ; *s == ' ' || *s == '\t'; ++s); if (*s == '=') { size_t len; for (++s; *s == ' ' || *s == '\t'; ++s); if ((len = strcspn(s, terminators))) { if (*s == '"' && s[len-1] == '"' && len > 1) { /* quoted string */ s++; len -= 2; } *charset = dStrndup(s, len); } } } } } /* * Compare two Content-Type strings. * Return 0 if they are equivalent, and 1 otherwise. */ int a_Misc_content_type_cmp(const char *ct1, const char *ct2) { char *major1, *major2, *minor1, *minor2, *charset1, *charset2; int ret; if ((!ct1 || !*ct1) && (!ct2 || !*ct2)) return 0; if ((!ct1 || !*ct1) || (!ct2 || !*ct2)) return 1; a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1); a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2); if (major1 && major2 && !dStrcasecmp(major1, major2) && minor1 && minor2 && !dStrcasecmp(minor1, minor2) && ((!charset1 && !charset2) || (charset1 && charset2 && !dStrcasecmp(charset1, charset2)) || (!charset1 && charset2 && !dStrcasecmp(charset2, "UTF-8")) || (charset1 && !charset2 && !dStrcasecmp(charset1, "UTF-8")))) { ret = 0; } else { ret = 1; } dFree(major1); dFree(major2); dFree(minor1); dFree(minor2); dFree(charset1); dFree(charset2); return ret; } /* * Check the server-supplied 'Content-Type' against our detected type. * (some servers seem to default to "text/plain"). * * Return value: * 0, if they match * -1, if a mismatch is detected * * There are many MIME types Dillo doesn't know, they're handled * as "application/octet-stream" (as the SPEC says). * * A mismatch happens when receiving a binary stream as * "text/plain" or "text/html", or an image that's not an image of its kind. * * Note: this is a basic security procedure. * */ int a_Misc_content_type_check(const char *EntryType, const char *DetectedType) { int i; int st = -1; _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType); if (!EntryType) return 0; /* there's no mismatch without server type */ for (i = 1; MimeTypes[i].str; ++i) if (dStrncasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) == 0) break; if (!MimeTypes[i].str) { /* type not found, no mismatch */ st = 0; } else if (dStrncasecmp(EntryType, "image/", 6) == 0 && !dStrncasecmp(DetectedType,MimeTypes[i].str,MimeTypes[i].len)){ /* An image, and there's an exact match */ st = 0; } else if (dStrncasecmp(EntryType, "text/", 5) || dStrncasecmp(DetectedType, "application/", 12)) { /* Not an application sent as text */ st = 0; } else if (dStrncasecmp(EntryType, "application/xhtml+xml", 21) && dStrncasecmp(DetectedType, "text/html", 9)) { /* XML version of HTML */ st = 0; } _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH"); return st; } /* * Parse a geometry string. */ int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h) { char *p, *t1, *t2; int n1, n2; int ret = 0; if ((p = strchr(str, 'x')) || (p = strchr(str, 'X'))) { n1 = strtol(str, &t1, 10); n2 = strtol(++p, &t2, 10); if (t1 != str && t2 != p) { *w = n1; *h = n2; ret = 1; /* parse x,y now */ p = t2; n1 = strtol(p, &t1, 10); n2 = strtol(t1, &t2, 10); if (t1 != p && t2 != t1) { *x = n1; *y = n2; } } } _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y); return ret; } /* * Parse dillorc's search_url string ("[<label> ]<url>") * Return value: -1 on error, 0 on success (and label and urlstr pointers) */ int a_Misc_parse_search_url(char *source, char **label, char **urlstr) { static char buf[32]; char *p, *q; int ret = -1; if ((p = strrchr(source, ' '))) { /* label and url pair */ strncpy(buf,source,MIN(p-source,31)); buf[MIN(p-source,31)] = 0; source = p+1; if ((p = strchr(source, '/')) && p[1] && (q = strchr(p+2,'/'))) { *urlstr = source; ret = 0; } } else { /* url only, make a custom label */ if ((p = strchr(source, '/')) && p[1] && (q = strchr(p+2,'/'))) { strncpy(buf,p+2,MIN(q-p-2,31)); buf[MIN(q-p-2,31)] = 0; *urlstr = source; ret = 0; } } *label = buf; return ret; } /* * Encodes string using base64 encoding. * Return value: new string or NULL if input string is empty. */ char *a_Misc_encode_base64(const char *in) { static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz" "0123456789+/"; char *out = NULL; int len, i = 0; if (in == NULL) return NULL; len = strlen(in); out = (char *)dMalloc((len + 2) / 3 * 4 + 1); for (; len >= 3; len -= 3) { out[i++] = base64_hex[in[0] >> 2]; out[i++] = base64_hex[((in[0]<<4) & 0x30) | (in[1]>>4)]; out[i++] = base64_hex[((in[1]<<2) & 0x3c) | (in[2]>>6)]; out[i++] = base64_hex[in[2] & 0x3f]; in += 3; } if (len > 0) { unsigned char fragment; out[i++] = base64_hex[in[0] >> 2]; fragment = (in[0] << 4) & 0x30; if (len > 1) fragment |= in[1] >> 4; out[i++] = base64_hex[fragment]; out[i++] = (len < 2) ? '=' : base64_hex[(in[1] << 2) & 0x3c]; out[i++] = '='; } out[i] = '\0'; return out; } /* * Load a local file into a dStr. * Return value: dStr on success, NULL on error. * TODO: a filesize threshold may be implemented. */ Dstr *a_Misc_file2dstr(const char *filename) { FILE *F_in; int n; char buf[4096]; Dstr *dstr = NULL; if ((F_in = fopen(filename, "r"))) { dstr = dStr_sized_new(4096); while ((n = fread (buf, 1, 4096, F_in)) > 0) { dStr_append_l(dstr, buf, n); } fclose(F_in); } return dstr; }