This is a piece of code I used to handle encodings not found in the
response stream.
// Read the whole body into memory first so it can be decoded a second time
// if a better charset is discovered after the initial decode.
Stream s = resp.GetResponseStream();
byte[] buffer = ReadStream(s); // ReadStream reads the Stream into a byte[]
// time to check encoding
// NOTE(review): if resp is an HttpWebResponse, ContentEncoding is the
// Content-Encoding header (e.g. "gzip"), not the charset — confirm whether
// CharacterSet was intended here.
string urlEnc = resp.ContentEncoding;
Encoding e = null;
if(urlEnc != null && urlEnc.Trim().Length > 0)
{
    try
    {
        e = Encoding.GetEncoding(urlEnc.Trim());
    }
    catch(ArgumentException)
    {
        // Not a known text encoding name; fall through to charset detection
        // below instead of aborting the whole download.
        e = null;
    }
}
// True only when the response itself named a usable text encoding.
bool encodingDeclared = (e != null);
if(!encodingDeclared)
    e = Encoding.UTF8; // provisional decode, good enough to find a charset tag
string temp = e.GetString(buffer, 0, buffer.Length);
// in case of no (usable) encoding, look for a charset and redecode the page
if(!encodingDeclared)
{
    // Prefer the Content-Type header, then the charset declared in the page source.
    string charset = GetCharSet(resp.ContentType, true);
    if(charset == null)
        charset = GetCharSet(temp, false);
    if(charset != null)
    {
        try
        {
            temp = Encoding.GetEncoding(charset.Trim()).GetString(buffer, 0, buffer.Length);
        }
        catch(ArgumentException)
        {
            // Unknown or malformed charset name: keep the provisional UTF-8 text.
        }
    }
}
....
// The idea of GetCharSet is to look for the charset declaration in the source.
// I have forgotten the reasons behind all the details, but they are probably
// there to ensure that every way of writing the declaration is detected.
/// <summary>
/// Extracts the value of the first <c>charset=...</c> declaration found in
/// <paramref name="s"/>. The keyword is matched case-insensitively and may be
/// separated from the '=' by whitespace.
/// </summary>
/// <param name="s">
/// The text to scan: a Content-Type header value when <paramref name="header"/>
/// is true, otherwise page source (e.g. HTML containing a meta tag).
/// </param>
/// <param name="header">
/// True to parse header syntax (value runs up to the next ';' or the end of the
/// string); false to parse markup syntax (value runs up to the next quote, '>',
/// ';', '/' or whitespace).
/// </param>
/// <returns>
/// The charset name with surrounding whitespace and quotes removed, or null when
/// no declaration is found, the value is empty, or (markup mode only) the end of
/// the value cannot be detected.
/// </returns>
private static string GetCharSet(string s, bool header)
{
    if(s == null || s.Length == 0)
        return null;

    const string keyword = "charset";
    int searchFrom = 0;
    while(searchFrom < s.Length)
    {
        int i = s.IndexOf(keyword, searchFrom, StringComparison.OrdinalIgnoreCase);
        if(i == -1) // charset not found, return
            return null;

        // Only accept an '=' that directly follows the keyword (whitespace
        // allowed); an unrelated '=' further along is not a declaration.
        int j = i + keyword.Length;
        while(j < s.Length && Char.IsWhiteSpace(s[j]))
            j++;
        if(j < s.Length && s[j] == '=')
        {
            if(header)
                return ReadHeaderCharSet(s, j + 1);
            return ReadMarkupCharSet(s, j + 1);
        }

        // The word "charset" appeared without an assignment; keep looking.
        searchFrom = i + keyword.Length;
    }
    return null;
}

// Reads a header-style value: everything from start up to the next ';' or the end.
private static string ReadHeaderCharSet(string s, int start)
{
    int end = s.IndexOf(';', start);
    string raw = (end == -1) ? s.Substring(start) : s.Substring(start, end - start);
    return CleanCharSet(raw);
}

// Reads a markup-style value, e.g. from content="text/html; charset=utf-8"
// or <meta charset="utf-8">.
private static string ReadMarkupCharSet(string s, int start)
{
    int pos = start;
    while(pos < s.Length && Char.IsWhiteSpace(s[pos]))
        pos++;
    // Skip the opening quote of forms like charset="utf-8" so that it is not
    // mistaken for the end of the value.
    if(pos < s.Length && (s[pos] == '"' || s[pos] == '\''))
        pos++;

    int end = s.IndexOfAny(new char[] { '"', '\'', '>', ';', '/', ' ', '\t', '\r', '\n' }, pos);
    if(end == -1) // not able to detect end of the encoding word
        return null;
    return CleanCharSet(s.Substring(pos, end - pos));
}

// Strips surrounding whitespace and quotes; returns null for an empty result.
private static string CleanCharSet(string raw)
{
    string cleaned = raw.Trim().Trim('"', '\'').Trim();
    return cleaned.Length == 0 ? null : cleaned;
}