diff src/misc.c @ 250:fce9380ee68a

- Switched from charset to content-type for handling data. - Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text.
author jcid
date Thu, 29 May 2008 02:19:08 +0200
parents 0038a2943cc2
children 1deca4cad4c4
line wrap: on
line diff
--- a/src/misc.c	Wed May 28 23:54:36 2008 +0200
+++ b/src/misc.c	Thu May 29 02:19:08 2008 +0200
@@ -93,6 +93,15 @@
    { NULL, 0 }
 };
 
+typedef enum {
+   DT_OCTET_STREAM = 0,
+   DT_TEXT_HTML,
+   DT_TEXT_PLAIN,
+   DT_IMAGE_GIF,
+   DT_IMAGE_PNG,
+   DT_IMAGE_JPG,
+} DetectedContentType;
+
 /*
  * Detects 'Content-Type' from a data stream sample.
  *
@@ -105,10 +114,10 @@
  */
 int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
 {
+   size_t i, non_ascci, non_ascci_text, bin_chars;
+   char *p = Data;
    int st = 1;      /* default to "doubt' */
-   int Type = 0;    /* default to "application/octet-stream" */
-   char *p = Data;
-   size_t i, non_ascci;
+   DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
 
    /* HTML try */
    for (i = 0; i < Size && isspace(p[i]); ++i);
@@ -119,36 +128,50 @@
        /* this line is workaround for FTP through the Squid proxy */
        (Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) {
 
-      Type = 1;
+      Type = DT_TEXT_HTML;
       st = 0;
    /* Images */
    } else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) {
-      Type = 3;
+      Type = DT_IMAGE_GIF;
       st = 0;
    } else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) {
-      Type = 4;
+      Type = DT_IMAGE_PNG;
       st = 0;
    } else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) {
       /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
        * at the character representation should be machine independent. */
-      Type = 5;
+      Type = DT_IMAGE_JPG;
       st = 0;
 
    /* Text */
    } else {
-      /* We'll assume "text/plain" if the set of chars above 127 is <= 10
-       * in a 256-bytes sample.  Better heuristics are welcomed! :-) */
-      non_ascci = 0;
+      /* Heuristic for "text/plain"
+       * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
+       * All in the above set regard [00-31] as control characters.
+       * LATIN1: [7F-9F] unused
+       * CP-1251 {7F,98} unused (two characters).
+       * 
+       * We'll use [0-31] as indicators of non-text content.
+       * Better heuristics are welcomed! :-) */
+
+      non_ascci = non_ascci_text = bin_chars = 0;
       Size = MIN (Size, 256);
-      for (i = 0; i < Size; i++)
-         if ((uchar_t) p[i] > 127)
+      for (i = 0; i < Size; i++) {
+         int ch = (uchar_t) p[i];
+         if (ch < 32 && !isspace(ch))
+            ++bin_chars;
+         if (ch > 126)
             ++non_ascci;
-      if (Size == 256) {
-         Type = (non_ascci > 10) ? 0 : 2;
+         if (ch > 190)
+            ++non_ascci_text;
+      }
+      if (bin_chars == 0) {
+         /* Let's say text: if "rare" chars are <= 10% */
+         if ((non_ascci - non_ascci_text) <= Size/10)
+            Type = DT_TEXT_PLAIN;
+      }
+      if (Size == 256)
          st = 0;
-      } else {
-         Type = (non_ascci > 0) ? 0 : 2;
-      }
    }
 
    *PT = MimeTypes[Type].str;
@@ -156,6 +179,91 @@
 }
 
 /*
+ * Parse Content-Type string, e.g., "text/html; charset=utf-8".
+ */
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+                               char **charset)
+{
+   const char *s;
+
+   if (major)
+      *major = NULL;
+   if (minor)
+      *minor = NULL;
+   if (charset)
+      *charset = NULL;
+   if (!str)
+      return;
+
+   for (s = str; isalnum(*s) || (*s == '-'); s++);
+   if (major)
+      *major = dStrndup(str, s - str);
+
+   if (*s == '/') {
+      for (str = ++s; isalnum(*s) || (*s == '-'); s++);
+      if (minor)
+         *minor = dStrndup(str, s - str);
+   }
+
+   if (charset && *s) {
+      const char terminators[] = " ;\t";
+      const char key[] = "charset";
+
+      if ((s = dStristr(str, key)) &&
+          (s == str || strchr(terminators, s[-1]))) {
+         s += sizeof(key) - 1;
+         for ( ; *s == ' ' || *s == '\t'; ++s);
+         if (*s == '=') {
+            size_t len;
+            for (++s; *s == ' ' || *s == '\t'; ++s);
+            if ((len = strcspn(s, terminators))) {
+               if (*s == '"' && s[len-1] == '"' && len > 1) {
+                 /* quoted string */
+                 s++;
+                 len -= 2;
+               }
+               *charset = dStrndup(s, len);
+            }
+         }
+      }
+   }
+}
+
+/*
+ * Compare two Content-Type strings.
+ * Return 0 if they are equivalent, and 1 otherwise.
+ */
+int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
+{
+   char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
+   int ret;
+
+   if ((!ct1 || !*ct1) && (!ct2 || !*ct2))
+      return 0;
+   if ((!ct1 || !*ct1) || (!ct2 || !*ct2))
+      return 1;
+
+   a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
+   a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);
+
+   if (major1 && major2 && !dStrcasecmp(major1, major2) &&
+       minor1 && minor2 && !dStrcasecmp(minor1, minor2) &&
+       ((!charset1 && !charset2) ||
+        (charset1 && charset2 && !dStrcasecmp(charset1, charset2)) ||
+        (!charset1 && charset2 && !dStrcasecmp(charset2, "UTF-8")) ||
+        (charset1 && !charset2 && !dStrcasecmp(charset1, "UTF-8")))) {
+      ret = 0;
+   } else {
+      ret = 1;
+   }
+   dFree(major1); dFree(major2);
+   dFree(minor1); dFree(minor2);
+   dFree(charset1); dFree(charset2);
+
+   return ret;
+}
+
+/*
  * Check the server-supplied 'Content-Type' against our detected type.
  * (some servers seem to default to "text/plain").
  *
@@ -177,7 +285,7 @@
    int i;
    int st = -1;
 
-   _MSG("Type check:  [Srv: %s  Det: %s]\n", EntryType, DetectedType);
+   MSG("Type check:  [Srv: %s  Det: %s]\n", EntryType, DetectedType);
 
    if (!EntryType)
       return 0; /* there's no mismatch without server type */