-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathMHTMLParser.cs
313 lines (301 loc) · 9.81 KB
/
MHTMLParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
/// <summary>
/// MHTMLParser splits an MHTML (MIME HTML) archive into its parts and decodes each part to text.
/// Calling <see cref="GetHTMLText"/> produces static HTML with the images inlined as base64 data URIs.
/// </summary>
public class MHTMLParser
{
    const string BOUNDARY = "boundary";
    const string CHAR_SET = "charset";
    const string CONTENT_TYPE = "Content-Type";
    const string CONTENT_TRANSFER_ENCODING = "Content-Transfer-Encoding";
    const string CONTENT_LOCATION = "Content-Location";
    const string CONTENT_DISPOSITION = "Content-Disposition";
    const string CONTENT_ID = "Content-ID";
    const string FILE_NAME = "filename";
    const string DEFAULT_CHARSET = "utf-8";

    private string mhtmlString; // the string we want to decode
    private readonly StringBuilder log = new StringBuilder(); // running log of the decoding process
    private readonly List<string> contentIds = new List<string>(); // Content-ID of each dataset entry (same index)

    /// <summary>
    /// When true, base64 image parts are decoded to a string; when false (default) they are kept
    /// as the raw base64 text so they can be embedded in HTML.
    /// </summary>
    public bool decodeImageData;

    /// <summary>
    /// Results of the conversion, one string[3] per part:
    /// [0] is the content type, [1] is the content name (file name, else Content-Location),
    /// [2] is the converted data.
    /// </summary>
    public List<string[]> dataset;

    /// <summary>
    /// Per-part parsing state. A fresh instance is created at every boundary so that nothing
    /// (type, encoding, name, charset) leaks from one part into the next.
    /// </summary>
    private sealed class MimePart
    {
        public string Type = "";
        public string TransferEncoding = "";
        public string Location = "";
        public string FileName = "";
        public string Charset = DEFAULT_CHARSET;
        public string ContentId = "";
        public string CurrentHeader = "";   // header that a continuation line belongs to
        public bool InHeaders = true;       // false once the blank line after the headers was seen
        public bool IsContainer;            // multipart part that declares its own boundary
        public readonly StringBuilder Body = new StringBuilder();
    }

    /// <summary>
    /// Default constructor: empty dataset, image decoding switched off.
    /// </summary>
    public MHTMLParser()
    {
        dataset = new List<string[]>();
        log.Append("Initialized dataset.\n");
        decodeImageData = false;
    }

    /// <summary>
    /// Initializes the parser with the MHTML text to decode.
    /// </summary>
    /// <param name="mhtml">Complete MHTML document.</param>
    public MHTMLParser(string mhtml)
        : this()
    {
        SetMHTMLString(mhtml);
    }

    /// <summary>
    /// Initializes the parser with the MHTML text and the image decoding option.
    /// </summary>
    /// <param name="mhtml">Complete MHTML document.</param>
    /// <param name="decodeImages">Value for <see cref="decodeImageData"/>.</param>
    public MHTMLParser(string mhtml, bool decodeImages)
        : this(mhtml)
    {
        decodeImageData = decodeImages;
    }

    /// <summary>
    /// Sets the MHTML text to decode. A null argument is logged and otherwise ignored
    /// (the previously set text, if any, is kept); this method never throws.
    /// </summary>
    /// <param name="mhtml">Complete MHTML document.</param>
    public void SetMHTMLString(string mhtml)
    {
        if (mhtml == null)
        {
            log.Append("The mhtml string is null\n");
            return;
        }

        mhtmlString = mhtml;
        log.Append("Set mhtml string.\n");
    }

    /// <summary>
    /// Splits the MHTML text into its MIME parts and decodes each one.
    /// The dataset is rebuilt on every call, so calling this repeatedly does not duplicate parts.
    /// </summary>
    /// <returns>The <see cref="dataset"/> list, one string[3] per part.</returns>
    /// <exception cref="ArgumentNullException">No MHTML text has been set.</exception>
    /// <exception cref="InvalidDataException">No boundary declaration was found.</exception>
    public List<string[]> DecompressString()
    {
        log.Append("Starting decompression \n");

        if (dataset == null)
        {
            dataset = new List<string[]>();
        }
        else
        {
            dataset.Clear();
        }
        contentIds.Clear();

        // StringReader throws ArgumentNullException when no text was set.
        using (StringReader reader = new StringReader(mhtmlString))
        {
            string boundary = GetBoundary(reader);
            if (boundary == null)
            {
                throw new InvalidDataException("Failed to find string 'boundary'");
            }
            log.Append("Found boundary.\n");

            // Nested multipart parts add their own boundary to this list while parsing.
            List<string> boundaries = new List<string> { boundary };
            MimePart part = null; // null while outside any part (preamble / epilogue)

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string temp = line.Trim();

                if (TryMatchDelimiter(temp, boundaries, out bool isClosing))
                {
                    StorePart(part);
                    part = isClosing ? null : new MimePart();
                }
                else if (part == null)
                {
                    // Text outside of any part carries no content; skip it.
                }
                else if (!part.InHeaders)
                {
                    // Body lines are kept verbatim, whatever they look like.
                    part.Body.Append(line).Append('\n');
                }
                else if (temp.Length == 0)
                {
                    part.InHeaders = false; // blank line separates headers from body
                }
                else
                {
                    ReadHeaderLine(part, line, boundaries);
                }
            }

            // Input ended without a closing delimiter: keep what was read of the last part.
            if (part != null && !part.InHeaders && (part.Type.Length > 0 || part.Body.Length > 0))
            {
                StorePart(part);
            }
        }

        log.Append("Closed Reader.\n");
        return dataset;
    }

    /// <summary>
    /// Decodes a finished part and appends it to the dataset. Multipart containers are skipped
    /// because their content is delivered through their nested parts.
    /// </summary>
    private void StorePart(MimePart part)
    {
        if (part == null || part.IsContainer)
        {
            return;
        }

        string[] data = new string[3];
        data[0] = part.Type;
        data[1] = part.FileName.Length > 0 ? part.FileName : part.Location;
        data[2] = WriteBufferContent(part.Body, part.TransferEncoding, part.Charset, part.Type, decodeImageData);
        dataset.Add(data);
        contentIds.Add(part.ContentId);
        log.Append("Wrote Buffer Content and reset buffer.\n");
    }

    /// <summary>
    /// Interprets one line of a part's header section (a new header or a continuation line).
    /// </summary>
    private void ReadHeaderLine(MimePart part, string line, List<string> boundaries)
    {
        int nameLength = HeaderNameLength(line);
        bool isContinuation = nameLength <= 0;
        string value;
        if (isContinuation)
        {
            value = line.Trim();
        }
        else
        {
            part.CurrentHeader = line.Substring(0, nameLength);
            value = line.Substring(nameLength + 1).Trim();
        }

        string header = part.CurrentHeader;
        if (HeaderIs(header, CONTENT_TYPE))
        {
            if (!isContinuation)
            {
                // Only the media type; parameters such as charset are read separately.
                part.Type = FirstSegment(value);
                log.Append("Got content type.\n");
            }

            string charset = GetParameter(value, CHAR_SET);
            if (!string.IsNullOrEmpty(charset))
            {
                part.Charset = charset;
                log.Append("Got charset.\n");
            }

            string nestedBoundary = GetParameter(value, BOUNDARY);
            if (!string.IsNullOrEmpty(nestedBoundary))
            {
                if (!boundaries.Contains(nestedBoundary))
                {
                    boundaries.Add(nestedBoundary);
                }
                part.IsContainer = true;
            }
        }
        else if (HeaderIs(header, CONTENT_TRANSFER_ENCODING))
        {
            if (!isContinuation)
            {
                part.TransferEncoding = FirstSegment(value);
                log.Append("Got encoding (" + part.TransferEncoding + ").\n");
            }
        }
        else if (HeaderIs(header, CONTENT_LOCATION))
        {
            // A folded location continues on the following line(s).
            part.Location = isContinuation ? part.Location + value : value;
            log.Append("Got location.\n");
        }
        else if (HeaderIs(header, CONTENT_DISPOSITION))
        {
            string fileName = GetParameter(value, FILE_NAME);
            if (fileName != null)
            {
                part.FileName = fileName;
            }
        }
        else if (HeaderIs(header, CONTENT_ID))
        {
            if (!isContinuation)
            {
                // Content-ID values are wrapped in angle brackets; cid: references omit them.
                part.ContentId = value.Trim('<', '>', ' ', '\t');
            }
        }
        // Every other header is irrelevant for the conversion.
    }

    /// <summary>
    /// Decodes the body of one part according to its transfer encoding.
    /// Image parts are returned undecoded unless <paramref name="decodeImages"/> is true.
    /// Data that cannot be decoded is returned unchanged.
    /// </summary>
    private string WriteBufferContent(StringBuilder buffer, string encoding, string charset, string type, bool decodeImages)
    {
        log.Append("Start writing buffer contents.\n");
        string raw = buffer.ToString();

        if (IsImage(type))
        {
            log.Append("Image Data Detected.\n");
            if (!decodeImages)
            {
                log.Append("Skipping image decode.\n");
                return raw;
            }
        }

        string transferEncoding = (encoding ?? "").Trim();
        if (transferEncoding.Equals("base64", StringComparison.OrdinalIgnoreCase))
        {
            try
            {
                log.Append("base64 encoding detected.\n");
                // Text honours the declared charset; everything else keeps the ASCII conversion.
                string decoded = IsText(type)
                    ? ResolveEncoding(charset).GetString(Convert.FromBase64String(raw))
                    : DecodeFromBase64(raw);
                log.Append("Got base64 decoded string.\n");
                return decoded;
            }
            catch (FormatException e)
            {
                log.Append(e.Message).Append('\n');
                log.Append(e.StackTrace).Append('\n');
                log.Append("Data not Decoded.\n");
                return raw;
            }
        }

        if (transferEncoding.Equals("quoted-printable", StringComparison.OrdinalIgnoreCase))
        {
            log.Append("Quoted-Prinatble string detected.\n");
            return GetQuotedPrintableString(raw, charset);
        }

        log.Append("Unknown Encoding.\n");
        return raw;
    }

    /// <summary>
    /// Decodes base64 text and converts the resulting bytes to an ASCII string.
    /// </summary>
    /// <param name="encodedData">Base64 text; embedded white space is ignored.</param>
    /// <returns>The decoded bytes interpreted as ASCII.</returns>
    /// <exception cref="FormatException">The input is not valid base64.</exception>
    public string DecodeFromBase64(string encodedData)
    {
        byte[] encodedDataAsBytes = Convert.FromBase64String(encodedData);
        return Encoding.ASCII.GetString(encodedDataAsBytes);
    }

    /// <summary>
    /// Decodes a quoted-printable string, interpreting the encoded bytes as UTF-8.
    /// </summary>
    /// <param name="mimeString">Quoted-printable text.</param>
    /// <returns>The decoded text, or null when <paramref name="mimeString"/> is null.</returns>
    public string GetQuotedPrintableString(string mimeString)
    {
        return GetQuotedPrintableString(mimeString, DEFAULT_CHARSET);
    }

    /// <summary>
    /// Decodes a quoted-printable string, interpreting the encoded bytes in the given charset.
    /// </summary>
    /// <param name="mimeString">Quoted-printable text.</param>
    /// <param name="charset">Charset name; unknown names fall back to UTF-8.</param>
    /// <returns>The decoded text, or null when <paramref name="mimeString"/> is null.</returns>
    public string GetQuotedPrintableString(string mimeString, string charset)
    {
        if (mimeString == null)
        {
            return null;
        }

        Encoding encoding = ResolveEncoding(charset);
        StringBuilder output = new StringBuilder(mimeString.Length);
        List<byte> pending = new List<byte>(); // escaped bytes waiting to be decoded together

        int i = 0;
        while (i < mimeString.Length)
        {
            char c = mimeString[i];
            if (c != '=')
            {
                FlushBytes(pending, encoding, output);
                output.Append(c);
                i++;
                continue;
            }

            int next = i + 1;
            if (next + 1 < mimeString.Length && HexValue(mimeString[next]) >= 0 && HexValue(mimeString[next + 1]) >= 0)
            {
                pending.Add((byte)((HexValue(mimeString[next]) << 4) | HexValue(mimeString[next + 1])));
                i += 3;
                continue;
            }

            // Soft line break: '=' (plus optional padding) at the end of a line is removed together
            // with the line break. Pending bytes are kept because a multi-byte character may be
            // split across the break.
            int j = next;
            while (j < mimeString.Length && (mimeString[j] == ' ' || mimeString[j] == '\t'))
            {
                j++;
            }
            if (j >= mimeString.Length)
            {
                i = j;
            }
            else if (mimeString[j] == '\r' && j + 1 < mimeString.Length && mimeString[j + 1] == '\n')
            {
                i = j + 2;
            }
            else if (mimeString[j] == '\n' || mimeString[j] == '\r')
            {
                i = j + 1;
            }
            else
            {
                // Not a valid escape: keep the '=' literally.
                FlushBytes(pending, encoding, output);
                output.Append('=');
                i++;
            }
        }

        FlushBytes(pending, encoding, output);
        return output.ToString();
    }

    /// <summary>
    /// Reads lines until one declares a boundary parameter and returns its value.
    /// Quoted and unquoted values are supported, on a line of their own or after Content-Type.
    /// </summary>
    /// <returns>The boundary, or null when the input declares none.</returns>
    private string GetBoundary(StringReader reader)
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string boundary = GetParameter(line, BOUNDARY);
            if (!string.IsNullOrEmpty(boundary))
            {
                return boundary;
            }
        }
        return null;
    }

    /// <summary>
    /// Extracts the value of a header parameter (name=value or name="value") from a line.
    /// </summary>
    /// <returns>The value without quotes, or null when the parameter is not present.</returns>
    private static string GetParameter(string line, string name)
    {
        int searchFrom = 0;
        while (searchFrom < line.Length)
        {
            int nameIndex = line.IndexOf(name, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (nameIndex < 0)
            {
                return null;
            }
            searchFrom = nameIndex + name.Length;

            // The name must start a parameter, not be the tail of another word.
            bool startsParameter = nameIndex == 0
                || line[nameIndex - 1] == ';'
                || char.IsWhiteSpace(line[nameIndex - 1]);
            if (!startsParameter)
            {
                continue;
            }

            int pos = searchFrom;
            while (pos < line.Length && char.IsWhiteSpace(line[pos]))
            {
                pos++;
            }
            if (pos >= line.Length || line[pos] != '=')
            {
                continue;
            }

            string value = line.Substring(pos + 1).Trim();
            if (value.Length > 0 && value[0] == '"')
            {
                int close = value.IndexOf('"', 1);
                return close > 0 ? value.Substring(1, close - 1) : value.Substring(1);
            }
            return FirstSegment(value);
        }
        return null;
    }

    /// <summary>
    /// Returns the text before the first ';' (the whole text when there is none), trimmed.
    /// </summary>
    private static string FirstSegment(string value)
    {
        int semicolon = value.IndexOf(';');
        return (semicolon >= 0 ? value.Substring(0, semicolon) : value).Trim();
    }

    /// <summary>
    /// Returns the length of the header name when the line starts a new header ("Name: value"),
    /// or -1 when the line is a continuation of the previous header.
    /// </summary>
    private static int HeaderNameLength(string line)
    {
        if (line.Length == 0 || char.IsWhiteSpace(line[0]))
        {
            return -1;
        }

        for (int i = 0; i < line.Length; i++)
        {
            char c = line[i];
            if (c == ':')
            {
                return i > 0 ? i : -1;
            }
            if (!(char.IsLetterOrDigit(c) || c == '-' || c == '_' || c == '.'))
            {
                return -1; // e.g. an unindented  charset="..."  continuation line
            }
        }
        return -1;
    }

    /// <summary>
    /// Checks whether a trimmed line is a part delimiter ("--boundary") or a closing delimiter
    /// ("--boundary--") for any of the known boundaries.
    /// </summary>
    private static bool TryMatchDelimiter(string trimmedLine, List<string> boundaries, out bool isClosing)
    {
        isClosing = false;
        foreach (string boundary in boundaries)
        {
            string delimiter = "--" + boundary;
            if (!trimmedLine.StartsWith(delimiter, StringComparison.Ordinal))
            {
                continue;
            }

            string rest = trimmedLine.Substring(delimiter.Length);
            if (rest.Length == 0)
            {
                return true;
            }
            if (rest == "--")
            {
                isClosing = true;
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Maps a charset name to an Encoding; unknown or empty names fall back to UTF-8.
    /// </summary>
    private Encoding ResolveEncoding(string charset)
    {
        if (!string.IsNullOrWhiteSpace(charset))
        {
            try
            {
                return Encoding.GetEncoding(charset.Trim());
            }
            catch (Exception e) when (e is ArgumentException || e is NotSupportedException)
            {
                log.Append("Unknown charset '" + charset + "'; using utf-8.\n");
            }
        }
        return Encoding.UTF8;
    }

    /// <summary>Decodes the collected bytes as one unit and appends the text to the output.</summary>
    private static void FlushBytes(List<byte> pending, Encoding encoding, StringBuilder output)
    {
        if (pending.Count == 0)
        {
            return;
        }
        output.Append(encoding.GetString(pending.ToArray()));
        pending.Clear();
    }

    /// <summary>Returns the numeric value of a hexadecimal digit, or -1 for any other character.</summary>
    private static int HexValue(char c)
    {
        if (c >= '0' && c <= '9') { return c - '0'; }
        if (c >= 'A' && c <= 'F') { return c - 'A' + 10; }
        if (c >= 'a' && c <= 'f') { return c - 'a' + 10; }
        return -1;
    }

    private static bool HeaderIs(string header, string name)
    {
        return header.Equals(name, StringComparison.OrdinalIgnoreCase);
    }

    private static bool IsImage(string type)
    {
        return type != null && type.IndexOf("image", StringComparison.OrdinalIgnoreCase) >= 0;
    }

    private static bool IsText(string type)
    {
        return type != null && type.StartsWith("text/", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Removes all white space (base64 line breaks) so the data fits a data URI.</summary>
    private static string RemoveWhitespace(string text)
    {
        StringBuilder compact = new StringBuilder(text.Length);
        foreach (char c in text)
        {
            if (!char.IsWhiteSpace(c))
            {
                compact.Append(c);
            }
        }
        return compact.ToString();
    }

    /// <summary>
    /// Builds one HTML page from the archive: all text/html parts are concatenated and every
    /// "cid:" reference to an image part is replaced by a base64 data URI of that image.
    /// Images are matched by Content-ID first and by content name second.
    /// </summary>
    /// <returns>The HTML text with embedded images.</returns>
    /// <exception cref="InvalidOperationException"><see cref="decodeImageData"/> is true.</exception>
    public string GetHTMLText()
    {
        if (decodeImageData)
        {
            throw new InvalidOperationException("Turn off image decoding for valid html output.");
        }

        List<string[]> data = DecompressString();

        // First, write all HTML parts to the body.
        StringBuilder html = new StringBuilder();
        foreach (string[] part in data)
        {
            if (part[0].Equals("text/html", StringComparison.OrdinalIgnoreCase))
            {
                html.Append(part[2]);
                log.Append("Writing HTML Text\n");
            }
        }
        string body = html.ToString();

        // Then embed the images. Content-IDs go first: a content name is often a prefix of a
        // Content-ID, and replacing the shorter text first would corrupt the longer reference.
        for (int i = 0; i < data.Count; i++)
        {
            string contentId = i < contentIds.Count ? contentIds[i] : "";
            if (!IsImage(data[i][0]) || contentId.Length == 0)
            {
                continue;
            }
            body = body.Replace("cid:" + contentId, BuildDataUri(data[i]));
            log.Append("Overwriting HTML with image: " + contentId + "\n");
        }

        for (int i = 0; i < data.Count; i++)
        {
            string name = data[i][1];
            if (!IsImage(data[i][0]) || string.IsNullOrEmpty(name))
            {
                continue; // an empty name would match every "cid:" in the page
            }
            body = body.Replace("cid:" + name, BuildDataUri(data[i]));
            log.Append("Overwriting HTML with image: " + name + "\n");
        }

        return body;
    }

    /// <summary>
    /// Same as <see cref="GetHTMLText"/>; kept so that existing callers of the misspelled
    /// name continue to work.
    /// </summary>
    public string GtHTMLText()
    {
        return GetHTMLText();
    }

    private static string BuildDataUri(string[] part)
    {
        return "data:" + part[0] + ";base64," + RemoveWhitespace(part[2]);
    }

    /// <summary>
    /// Gets the log of the decoding process.
    /// </summary>
    public string GetLog()
    {
        return log.ToString();
    }
}