← Back to index

EmlParser

SpotsShare
Code | What It Does | How It Does It
▶ IMPORTS
import FoundationFramework importsImports Foundation.
/// Immutable value holding the fields extracted from one `.eml` message.
///
/// Every field is a plain `String`; a field that could not be extracted is
/// represented by the empty string rather than by an optional.
struct ParsedEmail {
    /// Text of the `Subject` header.
    let subject: String

    /// Text of the `Date` header, kept as a string (not parsed into a `Date`).
    let dateHeader: String

    /// Text of the `From` header.
    let from: String

    /// Readable text of the message body.
    let bodyText: String
}
/// Parses raw `.eml` messages into a `ParsedEmail`.
///
/// The parser is deliberately small and best-effort: it never throws, and
/// anything it cannot decode is passed through unchanged rather than dropped.
/// It understands folded headers, RFC 2047 encoded-words in `Subject`/`From`,
/// base64 and quoted-printable transfer encodings, the `charset` parameter of
/// `Content-Type`, and (nested) multipart bodies. HTML bodies are reduced to
/// text by a simple tag stripper.
class EmlParser {

    /// Parse raw .eml Data into a ParsedEmail.
    ///
    /// - Parameter data: The bytes of the `.eml` file.
    /// - Returns: The extracted fields. Fields that are missing from the
    ///   message are returned as empty strings.
    static func parse(data: Data) -> ParsedEmail {
        // ASCII is a strict subset of UTF-8, so retrying with ASCII after a
        // UTF-8 failure can never succeed. ISO-8859-1 maps every byte to a
        // scalar, so messages containing stray 8-bit bytes still get parsed
        // instead of producing an empty result.
        guard let rawString = String(data: data, encoding: .utf8)
            ?? String(data: data, encoding: .isoLatin1) else {
            return ParsedEmail(subject: "", dateHeader: "", from: "", bodyText: "")
        }

        let message = splitAtBlankLine(rawString)
        let headers = extractHeaders(from: message.head)

        let subject = decodeHeaderValue(headers["subject"] ?? "")
        let dateHeader = headers["date"] ?? ""
        let from = decodeHeaderValue(headers["from"] ?? "")
        // A message without a blank line has headers only, hence no body.
        let bodyText = message.body.map { extractBodyText(body: $0, headers: headers) } ?? ""

        return ParsedEmail(subject: subject, dateHeader: dateHeader, from: from, bodyText: bodyText)
    }

    // MARK: - Header Parsing

    /// Splits a message (or a MIME part) at its first blank line.
    ///
    /// Both CRLF and bare-LF blank lines are recognised; whichever occurs
    /// first in the text is used as the separator.
    ///
    /// - Returns: `head` is the text before the blank line. `body` is the text
    ///   after it, or `nil` when there is no blank line (then `head` is the
    ///   whole input).
    private static func splitAtBlankLine(_ raw: String) -> (head: String, body: String?) {
        let candidates = [raw.range(of: "\r\n\r\n"), raw.range(of: "\n\n")].compactMap { $0 }
        guard let separator = candidates.min(by: { $0.lowerBound < $1.lowerBound }) else {
            return (head: raw, body: nil)
        }
        return (
            head: String(raw[..<separator.lowerBound]),
            body: String(raw[separator.upperBound...])
        )
    }

    /// Extract all headers as a dictionary (lowercased keys).
    ///
    /// Accepts either a bare header section or a whole message/part; anything
    /// after the first blank line is ignored. When a header name occurs more
    /// than once, the last occurrence wins.
    private static func extractHeaders(from raw: String) -> [String: String] {
        let headerSection = splitAtBlankLine(raw).head

        // Unfold continuation lines (lines starting with whitespace are continuations)
        let unfolded = headerSection
            .replacingOccurrences(of: "\r\n ", with: " ")
            .replacingOccurrences(of: "\r\n\t", with: " ")
            .replacingOccurrences(of: "\n ", with: " ")
            .replacingOccurrences(of: "\n\t", with: " ")

        var headers: [String: String] = [:]
        for line in unfolded.components(separatedBy: .newlines) {
            // Lines without a colon (including the empty fragments produced by
            // splitting CRLF) are not headers.
            guard let colonIndex = line.firstIndex(of: ":") else { continue }
            let key = line[..<colonIndex].trimmingCharacters(in: .whitespaces).lowercased()
            let value = line[line.index(after: colonIndex)...].trimmingCharacters(in: .whitespaces)
            headers[key] = value
        }
        return headers
    }

    /// Decode RFC 2047 encoded-words (e.g. =?UTF-8?B?...?= or =?UTF-8?Q?...?=).
    ///
    /// Encoded-words that cannot be decoded are left in the output verbatim.
    /// Whitespace that only separates two decoded encoded-words is removed, as
    /// RFC 2047 requires.
    private static func decodeHeaderValue(_ value: String) -> String {
        let pattern = "=\\?([^?]+)\\?([BbQq])\\?([^?]+)\\?="
        guard let regex = try? NSRegularExpression(pattern: pattern) else { return value }

        // Work in NSString (UTF-16) coordinates throughout and rebuild the
        // output left to right, so no index is ever applied to a string other
        // than the one it was computed from.
        let source = value as NSString
        let matches = regex.matches(in: value, range: NSRange(location: 0, length: source.length))
        guard !matches.isEmpty else { return value }

        var result = ""
        var cursor = 0
        var previousWasDecoded = false

        for match in matches {
            let gap = source.substring(with: NSRange(location: cursor, length: match.range.location - cursor))
            let charset = source.substring(with: match.range(at: 1))
            let scheme = source.substring(with: match.range(at: 2)).uppercased()
            let payload = source.substring(with: match.range(at: 3))
            let encoding = stringEncoding(forCharset: charset) ?? .utf8

            let decoded: String?
            if scheme == "B" {
                // Base64
                decoded = Data(base64Encoded: payload).flatMap { String(data: $0, encoding: encoding) }
            } else {
                // Quoted-printable (with _ for space)
                let qp = payload.replacingOccurrences(of: "_", with: " ")
                decoded = decodeQuotedPrintable(qp, encoding: encoding)
            }

            if let decoded = decoded {
                let gapIsOnlyWhitespace = gap.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
                if !(previousWasDecoded && gapIsOnlyWhitespace) {
                    result += gap
                }
                result += decoded
                previousWasDecoded = true
            } else {
                result += gap
                result += source.substring(with: match.range)
                previousWasDecoded = false
            }
            cursor = match.range.location + match.range.length
        }

        result += source.substring(from: cursor)
        return result
    }

    /// Maps an IANA charset name (e.g. "utf-8", "iso-8859-1") to a
    /// `String.Encoding`, or returns `nil` when the name is not recognised.
    private static func stringEncoding(forCharset name: String) -> String.Encoding? {
        let cfEncoding = CFStringConvertIANACharSetNameToEncoding(name as CFString)
        guard cfEncoding != kCFStringEncodingInvalidId else { return nil }
        return String.Encoding(rawValue: CFStringConvertEncodingToNSStringEncoding(cfEncoding))
    }

    /// Reads the `charset` parameter of a Content-Type value, defaulting to
    /// UTF-8 when it is absent or names an unknown charset.
    private static func charset(fromContentType contentType: String) -> String.Encoding {
        guard let name = extractParameter("charset", from: contentType) else { return .utf8 }
        return stringEncoding(forCharset: name) ?? .utf8
    }

    /// Extracts a `name=value` or `name="value"` parameter from a structured
    /// header value. The parameter name is matched case-insensitively.
    private static func extractParameter(_ name: String, from headerValue: String) -> String? {
        let pattern = NSRegularExpression.escapedPattern(for: name) + "\\s*=\\s*\"?([^\";\\s]+)\"?"
        guard let regex = try? NSRegularExpression(pattern: pattern, options: .caseInsensitive),
              let match = regex.firstMatch(
                  in: headerValue,
                  range: NSRange(headerValue.startIndex..., in: headerValue)
              ),
              let range = Range(match.range(at: 1), in: headerValue) else {
            return nil
        }
        return String(headerValue[range])
    }

    // MARK: - Body Extraction

    /// Extract the readable body text of a message.
    ///
    /// - Parameters:
    ///   - body: Everything after the blank line that ends the top-level headers.
    ///   - headers: The top-level headers, with lowercased keys.
    /// - Returns: The text of the body: multipart messages are delegated to
    ///   `extractFromMultipart`, HTML bodies are stripped to text, and any
    ///   other body is returned after transfer-decoding.
    private static func extractBodyText(body: String, headers: [String: String]) -> String {
        let contentType = headers["content-type"] ?? ""
        let loweredType = contentType.lowercased()
        let transferEncoding = (headers["content-transfer-encoding"] ?? "").lowercased()

        // Check if multipart
        if loweredType.contains("multipart") {
            return extractFromMultipart(body: body, contentType: contentType)
        }

        // Single part — decode if needed, then strip HTML if applicable
        let decoded = decodePart(
            body,
            transferEncoding: transferEncoding,
            charset: charset(fromContentType: contentType)
        )
        return loweredType.contains("text/html") ? stripHtml(decoded) : decoded
    }

    /// Extract text from a multipart body.
    ///
    /// The first non-empty `text/plain` part is preferred; if there is none,
    /// the first `text/html` part is stripped to text. Parts marked
    /// `Content-Disposition: attachment` are ignored so that an attached text
    /// file cannot replace the real body. Nested multiparts are searched
    /// recursively.
    private static func extractFromMultipart(body: String, contentType: String) -> String {
        // Without a boundary the parts cannot be separated; fall back to
        // treating the whole body as markup.
        guard let boundary = extractBoundary(from: contentType) else {
            return stripHtml(body)
        }

        var plainText = ""
        var htmlText = ""

        for part in body.components(separatedBy: "--\(boundary)") {
            let trimmed = part.trimmingCharacters(in: .whitespacesAndNewlines)
            // Empty fragments and the "--" left over from the closing delimiter.
            if trimmed.isEmpty || trimmed == "--" { continue }

            let (head, content) = splitAtBlankLine(trimmed)
            // A fragment without a blank line has no body (e.g. the preamble).
            guard let partBody = content else { continue }

            let partHeaders = extractHeaders(from: head)
            let rawPartType = partHeaders["content-type"] ?? ""
            let partContentType = rawPartType.lowercased()
            let partEncoding = (partHeaders["content-transfer-encoding"] ?? "").lowercased()
            let disposition = (partHeaders["content-disposition"] ?? "").lowercased()

            if disposition.hasPrefix("attachment") { continue }

            if partContentType.contains("text/plain") {
                if plainText.isEmpty {
                    plainText = decodePart(
                        partBody,
                        transferEncoding: partEncoding,
                        charset: charset(fromContentType: rawPartType)
                    )
                }
            } else if partContentType.contains("text/html") {
                if htmlText.isEmpty {
                    htmlText = decodePart(
                        partBody,
                        transferEncoding: partEncoding,
                        charset: charset(fromContentType: rawPartType)
                    )
                }
            } else if partContentType.contains("multipart") {
                // Nested multipart
                let nested = extractFromMultipart(body: partBody, contentType: rawPartType)
                if plainText.isEmpty && !nested.isEmpty {
                    plainText = nested
                }
            }
        }

        // Prefer plain text, fall back to stripped HTML
        if !plainText.isEmpty {
            return plainText
        } else if !htmlText.isEmpty {
            return stripHtml(htmlText)
        }
        return stripHtml(body)
    }

    /// Returns the `boundary` parameter of a multipart Content-Type value.
    private static func extractBoundary(from contentType: String) -> String? {
        // Match boundary="value" or boundary=value
        return extractParameter("boundary", from: contentType)
    }

    // MARK: - Decoding

    /// Reverses a part's Content-Transfer-Encoding.
    ///
    /// - Parameters:
    ///   - text: The part body as it appears in the message.
    ///   - transferEncoding: Lowercased Content-Transfer-Encoding value.
    ///   - charset: Character set of the decoded bytes.
    /// - Returns: The decoded text. Input that is not base64/quoted-printable,
    ///   or base64 that is not valid, is returned unchanged. HTML is *not*
    ///   stripped here; callers decide that from the content type, so plain
    ///   text such as `<https://example.com>` survives.
    private static func decodePart(
        _ text: String,
        transferEncoding: String,
        charset: String.Encoding = .utf8
    ) -> String {
        if transferEncoding.contains("base64") {
            let cleaned = text.components(separatedBy: .whitespacesAndNewlines).joined()
            guard let data = Data(base64Encoded: cleaned) else { return text }
            // ISO-8859-1 accepts any byte sequence, so mislabelled text is
            // still shown rather than left as a base64 blob.
            return String(data: data, encoding: charset)
                ?? String(data: data, encoding: .isoLatin1)
                ?? text
        }
        if transferEncoding.contains("quoted-printable") {
            return decodeQuotedPrintable(text, encoding: charset) ?? text
        }
        return text
    }

    /// Decodes quoted-printable text.
    ///
    /// Soft line breaks (`=` at end of line) are removed, then `=XX` escapes
    /// are turned back into bytes. Bytes are collected per line together with
    /// the literal ASCII around them and decoded with `encoding` as one unit,
    /// so multi-byte characters (`=C3=A9` → "é") are reassembled correctly.
    /// A line whose bytes are not valid in `encoding` is decoded as
    /// ISO-8859-1 instead, which keeps one bad line from spoiling the rest.
    ///
    /// - Returns: The decoded text. The current implementation always
    ///   produces a value; the optional return type is kept for callers.
    private static func decodeQuotedPrintable(_ text: String, encoding: String.Encoding) -> String? {
        let unwrapped = text
            .replacingOccurrences(of: "=\r\n", with: "")
            .replacingOccurrences(of: "=\n", with: "")

        var output = ""
        var pendingBytes: [UInt8] = []

        func flushPendingBytes() {
            guard !pendingBytes.isEmpty else { return }
            let data = Data(pendingBytes)
            output += String(data: data, encoding: encoding)
                ?? String(data: data, encoding: .isoLatin1)
                ?? ""
            pendingBytes.removeAll(keepingCapacity: true)
        }

        let scalars = Array(unwrapped.unicodeScalars)
        var index = 0
        while index < scalars.count {
            let scalar = scalars[index]
            if scalar == "=",
               index + 2 < scalars.count,
               let high = hexValue(scalars[index + 1]),
               let low = hexValue(scalars[index + 2]) {
                pendingBytes.append((high << 4) | low)
                index += 3
            } else if scalar.isASCII, scalar != "\r", scalar != "\n" {
                // Literal ASCII may be the trailing byte of a multi-byte
                // character in some charsets, so it joins the byte run.
                pendingBytes.append(UInt8(scalar.value))
                index += 1
            } else {
                // Line breaks and (non-conforming) literal non-ASCII text end
                // the current byte run and are copied through as-is.
                flushPendingBytes()
                output.unicodeScalars.append(scalar)
                index += 1
            }
        }
        flushPendingBytes()
        return output
    }

    /// Value of an ASCII hexadecimal digit, or `nil` for any other scalar.
    private static func hexValue(_ scalar: Unicode.Scalar) -> UInt8? {
        switch scalar {
        case "0"..."9": return UInt8(scalar.value - 48)
        case "A"..."F": return UInt8(scalar.value - 55)
        case "a"..."f": return UInt8(scalar.value - 87)
        default: return nil
        }
    }

    /// Simple HTML tag stripper.
    ///
    /// Removes `<style>`/`<script>` blocks, turns `<br>`, `</p>` and `</div>`
    /// into newlines, drops all remaining tags, decodes a handful of common
    /// entities, collapses runs of spaces/tabs and of blank lines, and trims
    /// the result.
    private static func stripHtml(_ html: String) -> String {
        var text = html

        // Remove style and script blocks
        let blockPattern = "<(style|script)[^>]*>.*?</\\1>"
        if let regex = try? NSRegularExpression(
            pattern: blockPattern,
            options: [.caseInsensitive, .dotMatchesLineSeparators]
        ) {
            text = regex.stringByReplacingMatches(
                in: text,
                range: NSRange(text.startIndex..., in: text),
                withTemplate: ""
            )
        }

        // Replace <br>, </p> and </div> with newlines (any letter case)
        text = text.replacingOccurrences(
            of: "<br[^>]*>",
            with: "\n",
            options: [.regularExpression, .caseInsensitive]
        )
        text = text.replacingOccurrences(of: "</p>", with: "\n", options: .caseInsensitive)
        text = text.replacingOccurrences(of: "</div>", with: "\n", options: .caseInsensitive)

        // Strip remaining tags
        text = text.replacingOccurrences(of: "<[^>]+>", with: "", options: .regularExpression, range: nil)

        // Decode common HTML entities. `&amp;` must be decoded last: doing it
        // first would turn `&amp;lt;` into `&lt;` and then wrongly into `<`.
        text = text.replacingOccurrences(of: "&lt;", with: "<")
        text = text.replacingOccurrences(of: "&gt;", with: ">")
        text = text.replacingOccurrences(of: "&quot;", with: "\"")
        text = text.replacingOccurrences(of: "&#39;", with: "'")
        text = text.replacingOccurrences(of: "&apos;", with: "'")
        text = text.replacingOccurrences(of: "&nbsp;", with: " ")
        text = text.replacingOccurrences(of: "&amp;", with: "&")

        // Collapse whitespace
        text = text.replacingOccurrences(of: "[ \t]+", with: " ", options: .regularExpression)
        // Collapse multiple newlines
        text = text.replacingOccurrences(of: "\n{3,}", with: "\n\n", options: .regularExpression)

        return text.trimmingCharacters(in: .whitespacesAndNewlines)
    }
}