changeset 250:fce9380ee68a

- Switched from charset to content-type for handling data. - Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text.
author jcid
date Thu, 29 May 2008 02:19:08 +0200
parents 343dba6aa5d7
children c56887b79eca
files ChangeLog src/cache.c src/cache.h src/capi.c src/capi.h src/html.cc src/misc.c src/misc.h
diffstat 8 files changed, 230 insertions(+), 86 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Wed May 28 23:54:36 2008 +0200
+++ b/ChangeLog	Thu May 29 02:19:08 2008 +0200
@@ -103,6 +103,7 @@
  - Switched Window::destroy to Window::delete, fixing side effects.
  - Made zlib a configure requirement, and cleaned up configure.in.
  - Fixed a segfault bug in Nav.c.
+ - Switched from charset to content-type for handling data.
    Patches: place (AKA corvid)
 +- Fixed a problem with locally-installed dpis.
  - Added code for optional image loading (nice interface) very advanced!
@@ -114,6 +115,7 @@
  - Added a_Capi_get_flags(). It requests a cache entry's status as flags.
  - Switched URL_DATA type from char* to a dStr.
  - Implemented the file input control for forms.
+ - Fixed data guesser to detect ASCII, LATIN1, UTF8, KOI8-R, CP-1251 as text.
    Patch: place, Jorge Arellano Cid
 +- Fixed a cookies-related dillo freeze bug happening at:
      http://www.fltk.org/newsgroups.php?gfltk.general+v:24912
--- a/src/cache.c	Wed May 28 23:54:36 2008 +0200
+++ b/src/cache.c	Thu May 29 02:19:08 2008 +0200
@@ -54,6 +54,7 @@
    const DilloUrl *Url;      /* Cached Url. Url is used as a primary Key */
    char *TypeDet;            /* MIME type string (detected from data) */
    char *TypeHdr;            /* MIME type string as from the HTTP Header */
+   char *TypeMeta;           /* MIME type string from META HTTP-EQUIV */
    Dstr *Header;             /* HTTP header */
    const DilloUrl *Location; /* New URI for redirects */
    Dstr *Data;               /* Pointer to raw data */
@@ -202,6 +203,7 @@
    NewEntry->Url = a_Url_dup(Url);
    NewEntry->TypeDet = NULL;
    NewEntry->TypeHdr = NULL;
+   NewEntry->TypeMeta = NULL;
    NewEntry->Header = dStr_new("");
    NewEntry->Location = NULL;
    NewEntry->Data = dStr_sized_new(8*1024);
@@ -290,6 +292,7 @@
    a_Url_free((DilloUrl *)entry->Url);
    dFree(entry->TypeDet);
    dFree(entry->TypeHdr);
+   dFree(entry->TypeMeta);
    dStr_free(entry->Header, TRUE);
    a_Url_free((DilloUrl *)entry->Location);
    dStr_free(entry->Data, 1);
@@ -390,6 +393,47 @@
 }
 
 /*
+ * Get current content type: from META if set, else HTTP header, else detected.
+ */
+static const char *Cache_current_content_type(CacheEntry_t *entry)
+{
+   return entry->TypeMeta ? entry->TypeMeta : entry->TypeHdr ? entry->TypeHdr :
+          entry->TypeDet;
+}
+
+/*
+ * Get current Content-Type for cache entry found by URL.
+ */
+const char *a_Cache_get_content_type(const DilloUrl *url)
+{
+   CacheEntry_t *entry = Cache_entry_search_with_redirect(url);
+
+   return (entry) ? Cache_current_content_type(entry) : NULL;
+}
+
+/*
+ * Set the META Content-Type for the cache entry found by url; an already set
+ * value is kept unless 'force' is TRUE. Return the resulting value, or NULL.
+ */
+const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype,
+                                     bool_t force)
+{
+   const char *ret;
+   CacheEntry_t *entry = Cache_entry_search_with_redirect(url);
+
+   if (!entry) {
+      ret = NULL;
+   } else {
+      if (force == TRUE || entry->TypeMeta == NULL) {
+         dFree(entry->TypeMeta);
+         entry->TypeMeta = dStrdup(ctype);
+      }
+      ret = entry->TypeMeta;
+   }
+   return ret;
+}
+
+/*
  * Get the pointer to the URL document, and its size, from the cache entry.
  * Return: 1 cached, 0 not cached.
  */
@@ -831,7 +875,7 @@
    if (!(entry->Flags & CA_GotContentType)) {
       st = a_Misc_get_content_type_from_data(
               entry->Data->str, entry->Data->len, &Type);
-      _MSG("Cache: detected Content-Type '%s'\n", Type);
+      MSG("Cache: detected Content-Type '%s'\n", Type);
       if (st == 0 || entry->Flags & CA_GotData) {
          if (a_Misc_content_type_check(entry->TypeHdr, Type) < 0) {
             MSG_HTTP("Content-Type '%s' doesn't match the real data.\n",
@@ -885,9 +929,9 @@
             if (TypeMismatch) {
                AbortEntry = TRUE;
             } else {
-               st = a_Web_dispatch_by_type(
-                       entry->TypeHdr ? entry->TypeHdr : entry->TypeDet,
-                       ClientWeb, &Client->Callback, &Client->CbData);
+               st = a_Web_dispatch_by_type(Cache_current_content_type(entry),
+                                           ClientWeb, &Client->Callback,
+                                           &Client->CbData);
                if (st == -1) {
                   /* MIME type is not viewable */
                   if (ClientWeb->flags & WEB_RootUrl) {
--- a/src/cache.h	Wed May 28 23:54:36 2008 +0200
+++ b/src/cache.h	Thu May 29 02:19:08 2008 +0200
@@ -59,6 +59,9 @@
 void a_Cache_init(void);
 int a_Cache_open_url(void *Web, CA_Callback_t Call, void *CbData);
 int a_Cache_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize);
+const char *a_Cache_get_content_type(const DilloUrl *url);
+const char *a_Cache_set_content_type(const DilloUrl *url, const char *ctype,
+                                     bool_t force);
 uint_t a_Cache_get_flags(const DilloUrl *url);
 void a_Cache_process_dbuf(int Op, const char *buf, size_t buf_size,
                           const DilloUrl *Url);
--- a/src/capi.c	Wed May 28 23:54:36 2008 +0200
+++ b/src/capi.c	Thu May 29 02:19:08 2008 +0200
@@ -406,6 +406,23 @@
 }
 
 /*
+ * Get the Content-Type associated with the URL
+ */
+const char *a_Capi_get_content_type(const DilloUrl *url)
+{
+   return a_Cache_get_content_type(url);
+}
+
+/*
+ * Set the Content-Type for the URL (wrapper for a_Cache_set_content_type).
+ */
+const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype,
+                                        bool_t force)
+{
+   return a_Cache_set_content_type(url, ctype, force);
+}
+
+/*
  * Send a dpi cmd.
  * (For instance: add_bookmark, open_url, send_preferences, ...)
  */
--- a/src/capi.h	Wed May 28 23:54:36 2008 +0200
+++ b/src/capi.h	Thu May 29 02:19:08 2008 +0200
@@ -24,6 +24,9 @@
 void a_Capi_init(void);
 int a_Capi_open_url(DilloWeb *web, CA_Callback_t Call, void *CbData);
 int a_Capi_get_buf(const DilloUrl *Url, char **PBuf, int *BufSize);
+const char *a_Capi_get_content_type(const DilloUrl *url);
+const char *a_Capi_set_content_type(const DilloUrl *url, const char *ctype,
+                                    bool_t force);
 int a_Capi_get_flags(const DilloUrl *Url);
 int a_Capi_dpi_send_cmd(DilloUrl *url, void *bw, char *cmd, char *server,
                          int flags);
--- a/src/html.cc	Wed May 28 23:54:36 2008 +0200
+++ b/src/html.cc	Thu May 29 02:19:08 2008 +0200
@@ -302,7 +302,7 @@
 public:  //BUG: for now everything is public
 
    BrowserWindow *bw;
-   DilloUrl *base_url;
+   DilloUrl *page_url, *base_url;
    dw::core::Widget *dw;    /* this is duplicated in the stack */
 
    /* -------------------------------------------------------------------*/
@@ -311,9 +311,9 @@
    size_t Buf_Consumed; /* amount of source from cache consumed */
    Dstr *Local_Buf;    /* source converted to displayable encoding (UTF-8) */
    int Local_Ofs;
-   char *charset;
-   bool using_meta_charset; /* to handle multiple meta_charset tags */
    Decode *decoder;
+   char *content_type, *charset;
+   bool stop_parser;
 
    size_t CurrTagOfs;
    size_t OldTagOfs, OldTagLine;
@@ -363,7 +363,7 @@
    void initDw();  /* Used by the constructor */
 
 public:
-   DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *charset);
+   DilloHtml(BrowserWindow *bw, const DilloUrl *url, const char *content_type);
    ~DilloHtml();
    void connectSignals(dw::core::Widget *dw);
    void write(char *Buf, int BufSize, int Eof);
@@ -430,11 +430,6 @@
 } TagInfo;
 extern const TagInfo Tags[];
 
-/* todo: implement this as an URL/charset pair in a DList.
- * chances of this bare-bones implementation to fail are minimal though:
- * two ROOT pages using meta-charset, parsing HEAD section at the same time */
-static char *meta_charset = NULL;
-
 /*-----------------------------------------------------------------------------
  *-----------------------------------------------------------------------------
  * Main Code
@@ -493,39 +488,12 @@
 }
 
 /*
- * Get charset string from HTTP Content-Type string.
- */
-static char *Html_get_charset(const char *ct)
-{
-   const char key[] = "charset";
-   const char terminators[] = " ;\t";
-   char *start;
-   size_t len;
-
-   if ((start = dStristr(ct, "charset")) &&
-       (start == ct || strchr(terminators, start[-1]))) {
-      start += sizeof(key) - 1;
-      for ( ; *start == ' ' || *start == '\t'; ++start);
-      if (*start == '=') {
-         for (++start; *start == ' ' || *start == '\t'; ++start);
-         _MSG("Html_get_charset: %s\n", start);
-         if ((len = strcspn(start, terminators)))
-            return dStrndup(start, len);
-      }
-   }
-   return NULL;
-}
-
-/*
  * Set callback function and callback data for the "html/text" MIME type.
  */
 void *a_Html_text(const char *Type, void *P, CA_Callback_t *Call, void **Data)
 {
    DilloWeb *web = (DilloWeb*)P;
-   char *charset = Html_get_charset(Type);
-   DilloHtml *html = new DilloHtml(web->bw, web->url, charset);
-
-   dFree(charset);
+   DilloHtml *html = new DilloHtml(web->bw, web->url, Type);
 
    *Data = (void*)html;
    *Call = (CA_Callback_t)Html_callback;
@@ -778,13 +746,14 @@
  * Create and initialize a new DilloHtml class
  */
 DilloHtml::DilloHtml(BrowserWindow *p_bw, const DilloUrl *url,
-                     const char *charset)
+                     const char *content_type)
 {
    /* Init event receiver */
    linkReceiver.html = this;
 
    /* Init main variables */
    bw = p_bw;
+   page_url = a_Url_dup(url);
    base_url = a_Url_dup(url);
    dw = NULL;
 
@@ -795,22 +764,14 @@
    Local_Buf = dStr_new("");
    Local_Ofs = 0;
 
-   if (charset) {
-      MSG("HTTP Content-Type gave charset as: %s\n", charset);
-   }
-   if (meta_charset) {
-      MSG("META Content-Type gave charset as: %s\n", meta_charset);
-   }
-   if (meta_charset) {
-      decoder = a_Decode_charset_init(meta_charset);
-      this->charset = meta_charset;
-      using_meta_charset = true;
-      meta_charset = NULL;
-   } else {
-      decoder = a_Decode_charset_init(charset);
-      this->charset = dStrdup(charset);
-      using_meta_charset = false;
-   }
+   MSG("HTML content type: %s\n", content_type);
+   this->content_type = dStrdup(content_type);
+
+   /* get charset */
+   a_Misc_parse_content_type(content_type, NULL, NULL, &charset);
+
+   decoder = a_Decode_charset_init(charset);
+   stop_parser = false;
 
    CurrTagOfs = 0;
    OldTagOfs = 0;
@@ -921,6 +882,7 @@
 
    a_Bw_remove_doc(bw, this);
 
+   a_Url_free(page_url);
    a_Url_free(base_url);
 
    for (int i = 0; i < forms->size(); i++)
@@ -1027,6 +989,7 @@
 
    a_Decode_free(decoder);
    dStr_free(Local_Buf, TRUE);
+   dFree(content_type);
    dFree(charset);
 }
 
@@ -3959,19 +3922,20 @@
          }
          dStr_free(ds_msg, 1);
 
-      } else if (!html->using_meta_charset &&
-                 !dStrcasecmp(equiv, "content-type") &&
+      } else if (!dStrcasecmp(equiv, "content-type") &&
                  (content = Html_get_attr(html, tag, tagsize, "content"))) {
-         char *charset = Html_get_charset(content);
-         if (charset) {
-            if (!html->charset || dStrcasecmp(charset, html->charset)) {
-               MSG("META Content-Type changes charset to: %s\n", charset);
-               dFree(meta_charset);
-               meta_charset = dStrdup(charset);
+         if (a_Misc_content_type_cmp(html->content_type, content)) {
+            const bool_t force = FALSE;
+            const char *new_content =
+               a_Capi_set_content_type(html->page_url, content, force);
+            /* Cannot ask cache whether the content type was changed, as
+             * this code in another bw might have already changed it for us.
+             */
+            if (a_Misc_content_type_cmp(html->content_type, new_content)) {
                a_Nav_repush(html->bw);
+               html->stop_parser = true;
             }
          }
-         dFree(charset);
       }   
    }
 }
@@ -5873,7 +5837,7 @@
     * boundary. Iterate through tokens until end of buffer is reached. */
    buf_index = 0;
    token_start = buf_index;
-   while (buf_index < bufsize) {
+   while ((buf_index < bufsize) && (html->stop_parser == false)) {
       /* invariant: buf_index == bufsize || token_start == buf_index */
 
       if (S_TOP(html)->parse_mode ==
--- a/src/misc.c	Wed May 28 23:54:36 2008 +0200
+++ b/src/misc.c	Thu May 29 02:19:08 2008 +0200
@@ -93,6 +93,15 @@
    { NULL, 0 }
 };
 
+typedef enum {
+   DT_OCTET_STREAM = 0,
+   DT_TEXT_HTML,
+   DT_TEXT_PLAIN,
+   DT_IMAGE_GIF,
+   DT_IMAGE_PNG,
+   DT_IMAGE_JPG,
+} DetectedContentType;
+
 /*
  * Detects 'Content-Type' from a data stream sample.
  *
@@ -105,10 +114,10 @@
  */
 int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
 {
+   size_t i, non_ascci, non_ascci_text, bin_chars;
+   char *p = Data;
    int st = 1;      /* default to "doubt' */
-   int Type = 0;    /* default to "application/octet-stream" */
-   char *p = Data;
-   size_t i, non_ascci;
+   DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
 
    /* HTML try */
    for (i = 0; i < Size && isspace(p[i]); ++i);
@@ -119,36 +128,50 @@
        /* this line is workaround for FTP through the Squid proxy */
        (Size - i >= 17 && !dStrncasecmp(p+i, "<!-- HTML listing", 17))) {
 
-      Type = 1;
+      Type = DT_TEXT_HTML;
       st = 0;
    /* Images */
    } else if (Size >= 4 && !dStrncasecmp(p, "GIF8", 4)) {
-      Type = 3;
+      Type = DT_IMAGE_GIF;
       st = 0;
    } else if (Size >= 4 && !dStrncasecmp(p, "\x89PNG", 4)) {
-      Type = 4;
+      Type = DT_IMAGE_PNG;
       st = 0;
    } else if (Size >= 2 && !dStrncasecmp(p, "\xff\xd8", 2)) {
       /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
        * at the character representation should be machine independent. */
-      Type = 5;
+      Type = DT_IMAGE_JPG;
       st = 0;
 
    /* Text */
    } else {
-      /* We'll assume "text/plain" if the set of chars above 127 is <= 10
-       * in a 256-bytes sample.  Better heuristics are welcomed! :-) */
-      non_ascci = 0;
+      /* Heuristic for "text/plain"
+       * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
+       * All in the above set regard [00-31] as control characters.
+       * LATIN1: [7F-9F] unused
+       * CP-1251 {7F,98} unused (two characters).
+       * 
+       * We'll use non-whitespace [0-31] as indicators of non-text content.
+       * Better heuristics are welcome! :-) */
+
+      non_ascci = non_ascci_text = bin_chars = 0;
       Size = MIN (Size, 256);
-      for (i = 0; i < Size; i++)
-         if ((uchar_t) p[i] > 127)
+      for (i = 0; i < Size; i++) {
+         int ch = (uchar_t) p[i];
+         if (ch < 32 && !isspace(ch))
+            ++bin_chars;
+         if (ch > 126)
             ++non_ascci;
-      if (Size == 256) {
-         Type = (non_ascci > 10) ? 0 : 2;
+         if (ch > 190)
+            ++non_ascci_text;
+      }
+      if (bin_chars == 0) {
+         /* Let's say text: if "rare" chars [127-190] are <= 10% */
+         if ((non_ascci - non_ascci_text) <= Size/10)
+            Type = DT_TEXT_PLAIN;
+      }
+      if (Size == 256)
          st = 0;
-      } else {
-         Type = (non_ascci > 0) ? 0 : 2;
-      }
    }
 
    *PT = MimeTypes[Type].str;
@@ -156,6 +179,91 @@
 }
 
 /*
+ * Parse Content-Type, e.g. "text/html; charset=utf-8". Caller frees results.
+ */
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+                               char **charset)
+{
+   const char *s;
+
+   if (major)
+      *major = NULL;
+   if (minor)
+      *minor = NULL;
+   if (charset)
+      *charset = NULL;
+   if (!str)
+      return;
+
+   for (s = str; isalnum(*s) || (*s == '-'); s++);
+   if (major)
+      *major = dStrndup(str, s - str);
+
+   if (*s == '/') {
+      for (str = ++s; isalnum(*s) || (*s == '-'); s++);
+      if (minor)
+         *minor = dStrndup(str, s - str);
+   }
+
+   if (charset && *s) {
+      const char terminators[] = " ;\t";
+      const char key[] = "charset";
+
+      if ((s = dStristr(str, key)) &&
+          (s == str || strchr(terminators, s[-1]))) {
+         s += sizeof(key) - 1;
+         for ( ; *s == ' ' || *s == '\t'; ++s);
+         if (*s == '=') {
+            size_t len;
+            for (++s; *s == ' ' || *s == '\t'; ++s);
+            if ((len = strcspn(s, terminators))) {
+               if (*s == '"' && s[len-1] == '"' && len > 1) {
+                 /* quoted string */
+                 s++;
+                 len -= 2;
+               }
+               *charset = dStrndup(s, len);
+            }
+         }
+      }
+   }
+}
+
+/*
+ * Compare two Content-Type strings.
+ * Return 0 if equivalent (a missing charset matches UTF-8), and 1 otherwise.
+ */
+int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
+{
+   char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
+   int ret;
+
+   if ((!ct1 || !*ct1) && (!ct2 || !*ct2))
+      return 0;
+   if ((!ct1 || !*ct1) || (!ct2 || !*ct2))
+      return 1;
+
+   a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
+   a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);
+
+   if (major1 && major2 && !dStrcasecmp(major1, major2) &&
+       minor1 && minor2 && !dStrcasecmp(minor1, minor2) &&
+       ((!charset1 && !charset2) ||
+        (charset1 && charset2 && !dStrcasecmp(charset1, charset2)) ||
+        (!charset1 && charset2 && !dStrcasecmp(charset2, "UTF-8")) ||
+        (charset1 && !charset2 && !dStrcasecmp(charset1, "UTF-8")))) {
+      ret = 0;
+   } else {
+      ret = 1;
+   }
+   dFree(major1); dFree(major2);
+   dFree(minor1); dFree(minor2);
+   dFree(charset1); dFree(charset2);
+
+   return ret;
+}
+
+/*
  * Check the server-supplied 'Content-Type' against our detected type.
  * (some servers seem to default to "text/plain").
  *
@@ -177,7 +285,7 @@
    int i;
    int st = -1;
 
-   _MSG("Type check:  [Srv: %s  Det: %s]\n", EntryType, DetectedType);
+   MSG("Type check:  [Srv: %s  Det: %s]\n", EntryType, DetectedType);
 
    if (!EntryType)
       return 0; /* there's no mismatch without server type */
--- a/src/misc.h	Wed May 28 23:54:36 2008 +0200
+++ b/src/misc.h	Thu May 29 02:19:08 2008 +0200
@@ -13,6 +13,9 @@
 char *a_Misc_expand_tabs(const char *str);
 int a_Misc_get_content_type_from_data(void *Data, size_t Size,const char **PT);
 int a_Misc_content_type_check(const char *EntryType, const char *DetectedType);
+void a_Misc_parse_content_type(const char *str, char **major, char **minor,
+                               char **charset);
+int a_Misc_content_type_cmp(const char* ct1, const char *ct2);
 int a_Misc_parse_geometry(char *geom, int *x, int *y, int *w, int *h);
 char *a_Misc_encode_base64(const char *in);
 Dstr *a_Misc_file2dstr(const char *filename);