| class EmlParser {
/// Parses the raw bytes of an `.eml` message into a `ParsedEmail`.
///
/// The bytes are decoded as UTF-8 when they form valid UTF-8; otherwise they are
/// decoded as ISO-8859-1 so that messages carrying legacy 8-bit content are still parsed.
///
/// - Parameter data: The raw message bytes.
/// - Returns: A `ParsedEmail` holding the decoded `Subject` and `From` headers, the raw
///   `Date` header, and the extracted body text. Missing headers yield empty strings.
static func parse(data: Data) -> ParsedEmail {
    // ASCII is a strict subset of UTF-8, so an ASCII fallback can never succeed where
    // UTF-8 has already failed. ISO-8859-1 assigns a character to every byte value,
    // which makes it a usable last-resort decoding.
    guard let rawString = String(data: data, encoding: .utf8)
        ?? String(data: data, encoding: .isoLatin1) else {
        return ParsedEmail(subject: "", dateHeader: "", from: "", bodyText: "")
    }
    let headers = extractHeaders(from: rawString)
    let subject = decodeHeaderValue(headers["subject"] ?? "")
    let dateHeader = headers["date"] ?? ""
    let from = decodeHeaderValue(headers["from"] ?? "")
    let bodyText = extractBodyText(from: rawString)
    return ParsedEmail(subject: subject, dateHeader: dateHeader, from: from, bodyText: bodyText)
}
// MARK: - Header Parsing
/// Builds a dictionary of header fields from a raw message (or message part).
///
/// Keys are the lowercased, whitespace-trimmed field names; values are the trimmed
/// field bodies. When a field name occurs more than once, the last occurrence wins.
///
/// - Parameter raw: Text that starts with a header block, optionally followed by a
///   blank line and a body.
/// - Returns: The parsed header fields.
private static func extractHeaders(from raw: String) -> [String: String] {
    // The header block stops at the first blank line; CRLF separators are looked for
    // before bare LF ones. Without any blank line the whole input is treated as headers.
    let separator = raw.range(of: "\r\n\r\n") ?? raw.range(of: "\n\n")
    let headerBlock = separator.map { String(raw[raw.startIndex..<$0.lowerBound]) } ?? raw

    // A line break followed by a space or tab marks a folded (continued) field;
    // join it onto the previous line with a single space.
    var unfolded = headerBlock
    for foldMarker in ["\r\n ", "\r\n\t", "\n ", "\n\t"] {
        unfolded = unfolded.replacingOccurrences(of: foldMarker, with: " ")
    }

    var table: [String: String] = [:]
    for fieldLine in unfolded.components(separatedBy: .newlines) {
        // Lines without a colon are not header fields and are skipped.
        guard let colon = fieldLine.firstIndex(of: ":") else { continue }
        let name = fieldLine[..<colon]
            .trimmingCharacters(in: .whitespaces)
            .lowercased()
        let content = fieldLine[fieldLine.index(after: colon)...]
            .trimmingCharacters(in: .whitespaces)
        table[name] = content
    }
    return table
}
/// Decodes RFC 2047 encoded-words (e.g. `=?UTF-8?B?...?=` or `=?UTF-8?Q?...?=`) in a header value.
///
/// Text outside encoded-words is copied through unchanged. An encoded-word that cannot be
/// decoded is left in its raw form. Whitespace that only separates two successfully decoded
/// encoded-words is dropped, as RFC 2047 §6.2 requires.
///
/// - Parameter value: The unfolded header field body.
/// - Returns: The header value with every decodable encoded-word replaced by its text.
private static func decodeHeaderValue(_ value: String) -> String {
    let pattern = "=\\?([^?]+)\\?([BbQq])\\?([^?]+)\\?="
    guard let regex = try? NSRegularExpression(pattern: pattern) else { return value }
    let matches = regex.matches(in: value, range: NSRange(value.startIndex..., in: value))

    // The output is assembled front to back so that every index is used only on the
    // string (`value`) it was computed from.
    var result = ""
    var cursor = value.startIndex
    var previousWordDecoded = false

    for match in matches {
        guard let fullRange = Range(match.range, in: value),
              let charsetRange = Range(match.range(at: 1), in: value),
              let encodingRange = Range(match.range(at: 2), in: value),
              let dataRange = Range(match.range(at: 3), in: value) else { continue }

        // A charset may carry an RFC 2231 language suffix ("UTF-8*en"); only the part
        // before "*" names the charset.
        let charsetField = value[charsetRange]
        let charset = charsetField
            .split(separator: "*", maxSplits: 1, omittingEmptySubsequences: false)
            .first.map(String.init) ?? String(charsetField)
        let encoding = value[encodingRange].uppercased()
        let encodedData = String(value[dataRange])

        // Unknown charset names fall back to UTF-8 instead of producing an invalid encoding.
        let cfEncoding = CFStringConvertIANACharSetNameToEncoding(charset as CFString)
        let stringEncoding: String.Encoding
        if cfEncoding == kCFStringEncodingInvalidId {
            stringEncoding = .utf8
        } else {
            stringEncoding = String.Encoding(rawValue: CFStringConvertEncodingToNSStringEncoding(cfEncoding))
        }

        var decoded: String?
        if encoding == "B" {
            // Base64
            if let data = Data(base64Encoded: encodedData) {
                decoded = String(data: data, encoding: stringEncoding)
            }
        } else if encoding == "Q" {
            // Quoted-printable variant in which "_" stands for a space
            let qp = encodedData.replacingOccurrences(of: "_", with: " ")
            decoded = decodeQuotedPrintable(qp, encoding: stringEncoding)
        }

        // Whitespace between two adjacent encoded-words is an encoding artifact, not text.
        let gap = value[cursor..<fullRange.lowerBound]
        let gapIsOnlyWhitespace = !gap.isEmpty && gap.allSatisfy { $0.isWhitespace }
        if !(previousWordDecoded && decoded != nil && gapIsOnlyWhitespace) {
            result.append(contentsOf: gap)
        }

        if let decoded = decoded {
            result.append(decoded)
            previousWordDecoded = true
        } else {
            result.append(contentsOf: value[fullRange])
            previousWordDecoded = false
        }
        cursor = fullRange.upperBound
    }

    result.append(contentsOf: value[cursor...])
    return result
}
// MARK: - Body Extraction
/// Produces the readable body text of a raw message.
///
/// Multipart messages are handed to `extractFromMultipart`. Single-part messages are
/// decoded according to their `Content-Transfer-Encoding` and, when the content type is
/// `text/html`, reduced to plain text.
///
/// - Parameter raw: The complete raw message, headers included.
/// - Returns: The body text, or an empty string when no blank line separates headers from body.
private static func extractBodyText(from raw: String) -> String {
    // The body begins right after the first blank line (CRLF form is checked first).
    guard let separator = raw.range(of: "\r\n\r\n") ?? raw.range(of: "\n\n") else {
        return ""
    }

    let headers = extractHeaders(from: raw)
    let contentType = headers["content-type"] ?? ""
    let loweredContentType = contentType.lowercased()
    let body = String(raw[separator.upperBound...])

    // Multipart: the original-case Content-Type is passed on so the boundary is intact.
    if loweredContentType.contains("multipart") {
        return extractFromMultipart(body: body, contentType: contentType)
    }

    // Single part: undo the transfer encoding, then strip markup for HTML bodies.
    let transferEncoding = (headers["content-transfer-encoding"] ?? "").lowercased()
    let text = decodePart(body, transferEncoding: transferEncoding)
    return loweredContentType.contains("text/html") ? stripHtml(text) : text
}
/// Extracts readable text from the body of a multipart message.
///
/// The first `text/plain` part (or the first nested multipart that yields text) is
/// preferred; otherwise the first `text/html` part is returned with its markup stripped.
/// When neither exists, or no boundary can be found, the whole body is run through
/// `stripHtml` as a best effort.
///
/// - Parameters:
///   - body: The multipart body, i.e. everything after the enclosing header block.
///   - contentType: The enclosing `Content-Type` header value, including the `boundary` parameter.
/// - Returns: The extracted text.
private static func extractFromMultipart(body: String, contentType: String) -> String {
    guard let boundary = extractBoundary(from: contentType) else {
        return stripHtml(body)
    }
    let delimiter = "--\(boundary)"
    let segments = body.components(separatedBy: delimiter)
    var plainText = ""
    var htmlText = ""

    // The text before the first delimiter is the preamble, which is not a body part.
    for segment in segments.dropFirst() {
        // "--boundary--" closes the multipart; whatever follows is the epilogue.
        if segment.hasPrefix("--") { break }

        let trimmed = segment.trimmingCharacters(in: .whitespacesAndNewlines)
        if trimmed.isEmpty { continue }

        // A part may have no header fields at all, in which case the delimiter line is
        // followed directly by a blank line. This must be detected before trimming,
        // because trimming removes that blank line.
        let hasEmptyHeaders = segment.hasPrefix("\r\n\r\n") || segment.hasPrefix("\n\n")

        let partHeaders: [String: String]
        let partBody: String
        if hasEmptyHeaders {
            partHeaders = [:]
            partBody = trimmed
        } else if let separator = trimmed.range(of: "\r\n\r\n") ?? trimmed.range(of: "\n\n") {
            partHeaders = extractHeaders(from: trimmed)
            partBody = String(trimmed[separator.upperBound...])
        } else {
            // Headers without a body: nothing to extract.
            continue
        }

        // RFC 2046 §5.1: a part without a Content-Type is text/plain.
        let rawPartContentType = partHeaders["content-type"] ?? "text/plain"
        let partContentType = rawPartContentType.lowercased()
        let partEncoding = (partHeaders["content-transfer-encoding"] ?? "").lowercased()

        // The first candidate of each kind is kept, so later text parts (typically
        // attachments) cannot replace the message body.
        if partContentType.contains("text/plain") {
            if plainText.isEmpty {
                plainText = decodePart(partBody, transferEncoding: partEncoding)
            }
        } else if partContentType.contains("text/html") {
            if htmlText.isEmpty {
                htmlText = decodePart(partBody, transferEncoding: partEncoding)
            }
        } else if partContentType.contains("multipart") {
            // Nested multipart (e.g. multipart/alternative inside multipart/mixed)
            if plainText.isEmpty {
                let nested = extractFromMultipart(body: partBody, contentType: rawPartContentType)
                if !nested.isEmpty {
                    plainText = nested
                }
            }
        }
    }

    // Prefer plain text, fall back to stripped HTML, then to the stripped raw body.
    if !plainText.isEmpty {
        return plainText
    }
    if !htmlText.isEmpty {
        return stripHtml(htmlText)
    }
    return stripHtml(body)
}
/// Reads the `boundary` parameter from a multipart `Content-Type` header value.
///
/// Both `boundary="value"` and `boundary=value` are accepted. A quoted value is
/// returned in full, including any spaces or semicolons inside the quotes; an
/// unquoted value ends at the first whitespace, semicolon or quote.
///
/// - Parameter contentType: The `Content-Type` header value.
/// - Returns: The boundary string without quotes, or `nil` when no boundary parameter is present.
private static func extractBoundary(from contentType: String) -> String? {
    // Group 1 captures a quoted value, group 2 an unquoted one. The parameter name
    // must start the string or follow a separator so that it is not matched as the
    // tail of another parameter name.
    let pattern = "(?:^|[;\\s])boundary\\s*=\\s*(?:\"([^\"]+)\"|([^\";\\s]+))"
    guard let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive),
          let match = regex.firstMatch(in: contentType, range: NSRange(contentType.startIndex..., in: contentType)) else {
        return nil
    }
    // The group that did not participate has location NSNotFound, for which
    // `Range(_:in:)` returns nil.
    for group in 1...2 {
        if let range = Range(match.range(at: group), in: contentType) {
            return String(contentType[range])
        }
    }
    return nil
}
// MARK: - Decoding
/// Reverses the content transfer encoding of a message part.
///
/// Only decoding is performed here; removing HTML markup is left to the caller, which
/// knows the part's content type.
///
/// - Parameters:
///   - text: The part body as it appears in the message.
///   - transferEncoding: The lowercased `Content-Transfer-Encoding` value.
/// - Returns: The decoded text, or `text` unchanged when the encoding is neither base64
///   nor quoted-printable, or when decoding fails.
private static func decodePart(_ text: String, transferEncoding: String) -> String {
    if transferEncoding.contains("base64") {
        // Base64 payloads are wrapped across lines; drop all whitespace before decoding.
        let cleaned = text.components(separatedBy: .whitespacesAndNewlines).joined()
        if let data = Data(base64Encoded: cleaned),
           let decoded = String(data: data, encoding: .utf8) {
            // Returned as-is: stripping markup here would damage text/plain parts
            // (e.g. "<user@example.com>") and strip text/html parts twice.
            return decoded
        }
        return text
    }
    if transferEncoding.contains("quoted-printable") {
        return decodeQuotedPrintable(text, encoding: .utf8) ?? text
    }
    return text
}
/// Decodes quoted-printable text into a string using the given character encoding.
///
/// Soft line breaks (`=` directly followed by a line break) are removed, and every
/// `=XX` hex escape is turned into one byte. The resulting byte sequence is then
/// decoded as a whole with `encoding`, so multi-byte characters such as `=C3=A9`
/// in UTF-8 are reassembled correctly. A `=` that is not followed by two hex digits
/// is kept literally.
///
/// - Parameters:
///   - text: The quoted-printable text.
///   - encoding: The character encoding of the escaped bytes.
/// - Returns: The decoded string. When the bytes are not valid in `encoding`, they are
///   decoded as ISO-8859-1 (one character per byte) instead.
private static func decodeQuotedPrintable(_ text: String, encoding: String.Encoding) -> String? {
    let unwrapped = text
        .replacingOccurrences(of: "=\r\n", with: "")
        .replacingOccurrences(of: "=\n", with: "")

    // Numeric value of an ASCII hex digit, or nil for any other scalar.
    func hexDigit(_ scalar: Unicode.Scalar) -> UInt8? {
        switch scalar.value {
        case 0x30...0x39: return UInt8(scalar.value - 0x30)
        case 0x41...0x46: return UInt8(scalar.value - 0x41 + 10)
        case 0x61...0x66: return UInt8(scalar.value - 0x61 + 10)
        default: return nil
        }
    }

    let scalars = Array(unwrapped.unicodeScalars)
    var bytes = Data()
    bytes.reserveCapacity(scalars.count)

    var index = 0
    while index < scalars.count {
        let scalar = scalars[index]
        if scalar == "=", index + 2 < scalars.count,
           let high = hexDigit(scalars[index + 1]),
           let low = hexDigit(scalars[index + 2]) {
            // "=XX" escape: one raw byte
            bytes.append((high << 4) | low)
            index += 3
        } else if scalar.isASCII {
            bytes.append(UInt8(scalar.value))
            index += 1
        } else {
            // Unescaped non-ASCII text: re-encode it in the target encoding so it
            // decodes back to the same character alongside the escaped bytes.
            let literal = String(scalar)
            bytes.append(literal.data(using: encoding) ?? Data(literal.utf8))
            index += 1
        }
    }

    return String(data: bytes, encoding: encoding)
        ?? String(data: bytes, encoding: .isoLatin1)
}
/// Reduces an HTML fragment to plain text.
///
/// `<style>` and `<script>` blocks are removed together with their content; `<br>`,
/// `</p>` and `</div>` become line breaks; all remaining tags are dropped; a small set
/// of common character entities is decoded; and runs of blanks or empty lines are collapsed.
/// This is a lightweight regex-based cleanup, not an HTML parser.
///
/// - Parameter html: The HTML (or HTML-like) text.
/// - Returns: The text content, trimmed of leading and trailing whitespace.
private static func stripHtml(_ html: String) -> String {
    var text = html

    // Remove style and script blocks, including everything between the tags.
    let blockPattern = "<(style|script)[^>]*>.*?</\\1>"
    if let regex = try? NSRegularExpression(pattern: blockPattern, options: [.caseInsensitive, .dotMatchesLineSeparators]) {
        text = regex.stringByReplacingMatches(in: text, range: NSRange(text.startIndex..., in: text), withTemplate: "")
    }

    // Turn line-breaking tags into newlines (tag names are matched case-insensitively).
    text = text.replacingOccurrences(of: "<br[^>]*>", with: "\n", options: [.regularExpression, .caseInsensitive])
    text = text.replacingOccurrences(of: "</p>", with: "\n", options: .caseInsensitive)
    text = text.replacingOccurrences(of: "</div>", with: "\n", options: .caseInsensitive)

    // Strip remaining tags.
    text = text.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression)

    // Decode common HTML entities. "&amp;" is handled last so that an escaped
    // entity such as "&amp;lt;" becomes the literal text "&lt;" rather than "<".
    let entities: [(name: String, character: String)] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
        ("&amp;", "&"),
    ]
    for entity in entities {
        text = text.replacingOccurrences(of: entity.name, with: entity.character)
    }

    // Collapse runs of spaces/tabs, then runs of three or more newlines.
    text = text.replacingOccurrences(of: "[ \t]+", with: " ", options: .regularExpression)
    text = text.replacingOccurrences(of: "\n{3,}", with: "\n\n", options: .regularExpression)
    return text.trimmingCharacters(in: .whitespacesAndNewlines)
}
} | `EmlParser` class | Defines the `EmlParser` class. |