#Scala Regular Expressions
Regular expressions are powerful tools for processing text and string matching. Scala provides rich regex support, including pattern matching integration and convenient APIs.
#Regex Basics
#Creating Regular Expressions
import scala.util.matching.Regex
/** Shows the ways to build a `Regex` and the three basic search calls. */
object RegexBasics {

  def main(args: Array[String]): Unit = {
    // Three interchangeable ways to obtain a Regex instance
    val viaDotR: Regex = "\\d+".r // String#r extension method
    val viaConstructor: Regex = new Regex("\\d+") // explicit constructor call
    val isoDate: Regex = """(\d{4})-(\d{2})-(\d{2})""".r // raw string: no double escaping

    // Inline flags are written inside the pattern text itself
    val ignoringCase: Regex = "(?i)hello".r
    val perLineAnchor: Regex = "(?m)^start".r

    println("Regex creation successful")

    val sentence = "The year 2023 has 365 days"
    val digitRun = "\\d+".r

    // findFirstIn yields an Option, so a miss needs no null check
    val leading = digitRun.findFirstIn(sentence).getOrElse("not found")
    println(s"First number: $leading")

    // findAllIn returns a one-shot iterator; materialise it before printing
    val everyNumber = digitRun.findAllIn(sentence).toList
    println(s"All numbers: $everyNumber")

    // Presence test: a defined first match means the text contains digits
    val containsDigits = digitRun.findFirstIn(sentence).nonEmpty
    println(s"Contains numbers: $containsDigits")
  }
}
#Basic Matching Operations
/** Runs several independent patterns over one block of text and reports every hit. */
object BasicMatching {

  def main(args: Array[String]): Unit = {
    val emailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
    val phonePattern = """\d{3}-\d{3}-\d{4}""".r
    val datePattern = """\d{4}-\d{2}-\d{2}""".r

    // stripMargin keeps the literal's content flush-left while the code stays indented
    val text =
      """
        |Contact information:
        |Email: john.doe@example.com, alice@company.org
        |Phone: 555-123-4567, 555-987-6543
        |Date: 2023-12-25, 2024-01-01
        |""".stripMargin

    // Prints every match of `pattern` in `text` as one comma-separated line
    def report(label: String, pattern: Regex): Unit =
      println(s"$label: ${pattern.findAllIn(text).mkString(", ")}")

    report("Email addresses", emailPattern)
    report("Phone numbers", phonePattern)
    report("Dates", datePattern)

    // findAllMatchIn yields Match objects, which carry offsets as well as the matched text
    println("\nDetailed match information:")
    for (hit <- emailPattern.findAllMatchIn(text))
      println(s"Email: '${hit.matched}' position: ${hit.start}-${hit.end}")
  }
}
#Capture Groups
#Basic Capture Groups
/** Extracts sub-matches with numbered capture groups, via `Match` and via extractor syntax. */
object CaptureGroups {

  def main(args: Array[String]): Unit = {
    val namePattern = """(\w+)\s+(\w+)""".r // group 1 = first name, group 2 = last name
    val emailPattern = """(\w+)@(\w+\.\w+)""".r // group 1 = username, group 2 = domain
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r // groups = year, month, day

    val text = "John Smith's email is john@example.com, registration date is 2023-12-25"

    // Option#fold: the first argument handles a miss, the second receives the Match
    namePattern.findFirstMatchIn("John Smith").fold(println("Name pattern not found")) { hit =>
      println(s"First name: ${hit.group(1)}")
      println(s"Last name: ${hit.group(2)}")
      println(s"Full match: ${hit.group(0)}") // group 0 is the whole match
    }

    emailPattern.findFirstMatchIn(text).fold(println("Email not found")) { hit =>
      println(s"Username: ${hit.group(1)}")
      println(s"Domain: ${hit.group(2)}")
    }

    datePattern.findFirstMatchIn(text).fold(println("Date pattern not found")) { hit =>
      println(s"Year: ${hit.group(1)}")
      println(s"Month: ${hit.group(2)}")
      println(s"Day: ${hit.group(3)}")
    }

    // A Regex is also an extractor (unapplySeq): it binds one variable per capture group,
    // and only succeeds when the pattern matches the whole string.
    val dateString = "2023-12-25"
    val verdict = dateString match {
      case datePattern(year, month, day) =>
        s"Pattern match - Year: $year, Month: $month, Day: $day"
      case _ =>
        "Date format doesn't match"
    }
    println(verdict)
  }
}
#Named Capture Groups
/** Reads capture groups by name, then shows the positional-groups-plus-case-class alternative. */
object NamedCaptureGroups {

  def main(args: Array[String]): Unit = {
    // Java-style (?<name>...) groups, looked up by name on the Match (Scala 2.13+)
    val logPattern =
      """(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?<level>\w+)\] (?<message>.+)""".r
    val logEntry = "2023-12-25 10:30:45 [INFO] Application started successfully"

    logPattern.findFirstMatchIn(logEntry).fold(println("Log format doesn't match")) { hit =>
      println(s"Timestamp: ${hit.group("timestamp")}")
      println(s"Level: ${hit.group("level")}")
      println(s"Message: ${hit.group("message")}")
    }

    // Alternative: plain positional groups destructured straight into a case class
    case class LogEntry(timestamp: String, level: String, message: String)
    val simpleLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)""".r

    // collect applies the extractor and yields None when the line does not match
    def parseLogEntry(log: String): Option[LogEntry] =
      Some(log).collect { case simpleLogPattern(ts, lvl, msg) => LogEntry(ts, lvl, msg) }

    val summary = parseLogEntry(logEntry).fold("Unable to parse log")(e => s"Parsed log: $e")
    println(summary)
  }
}
#String Replacement
#Basic Replacement Operations
/** Walks through literal, group-based and function-based replacement. */
object StringReplacement {

  def main(args: Array[String]): Unit = {
    val text = "The quick brown fox jumps over the lazy dog. The fox is quick."

    // Literal replacement: every occurrence, then only the first one
    val foxPattern = "fox".r
    val allFoxesGone = foxPattern.replaceAllIn(text, "cat")
    println(s"Replace fox -> cat: $allFoxesGone")
    val firstFoxGone = foxPattern.replaceFirstIn(text, "wolf")
    println(s"Replace first fox -> wolf: $firstFoxGone")

    // $1..$3 in the replacement text refer back to the capture groups
    val phonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
    val phoneText = "Phone numbers: 555-123-4567 and 555-987-6543"
    val prettyPhones = phonePattern.replaceAllIn(phoneText, "($1) $2-$3")
    println(s"Formatted phones: $prettyPhones")

    // A Match => String function computes each replacement from the match itself
    val twice: Regex.Match => String = hit => (hit.matched.toInt * 2).toString
    val numberPattern = """\d+""".r
    val numberText = "I have 5 apples and 10 oranges"
    println(s"Numbers doubled: ${numberPattern.replaceAllIn(numberText, twice)}")

    // Conditional replacement: short words are echoed back unchanged
    val wordPattern = """\b\w+\b""".r
    val shouted = wordPattern.replaceAllIn(text, hit => {
      val word = hit.matched
      if (word.length <= 4) word else word.toUpperCase
    })
    println(s"Long words uppercase: $shouted")
  }
}
#Advanced Replacement Techniques
/** Practical replacement recipes: tag stripping, link generation, masking and reformatting. */
object AdvancedReplacement {

  def main(args: Array[String]): Unit = {
    // HTML tag cleanup: drop anything that looks like <...>
    val htmlText = "<p>This is an <strong>important</strong> <em>message</em>.</p>"
    val htmlTagPattern = """<[^>]+>""".r
    val cleanText = htmlTagPattern.replaceAllIn(htmlText, "")
    println(s"Cleaned HTML: $cleanText")

    // URL conversion to links.
    // `[` has to be escaped inside a character class: java.util.regex treats a bare `[` as the
    // start of a nested class, and the pattern then fails to compile ("Unclosed character class").
    val urlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
    val textWithUrls = "Visit https://www.example.com or http://blog.example.org/post"
    // The function's result is interpreted as a replacement template. The pattern admits `$`
    // in the path, so the text is escaped to keep `$` and `\` literal.
    val linkedText = urlPattern.replaceAllIn(
      textWithUrls,
      m => Regex.quoteReplacement(s"<a href='${m.matched}'>${m.matched}</a>")
    )
    println(s"URL to links: $linkedText")

    // Sensitive information masking: keep the first and last group, star out the middle
    val creditCardPattern = """(\d{4})-(\d{4})-(\d{4})-(\d{4})""".r
    val sensitiveText = "Credit card number: 1234-5678-9012-3456"
    val maskedText = creditCardPattern.replaceAllIn(sensitiveText, "$1-****-****-$4")
    println(s"Credit card masked: $maskedText")

    // Date format conversion: YYYY-MM-DD -> DD/MM/YYYY by reordering group references
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r
    val dateText = "Meeting date: 2023-12-25"
    val reformattedDate = datePattern.replaceAllIn(dateText, "$3/$2/$1")
    println(s"Date format conversion: $reformattedDate")

    // Multi-step replacement: strip punctuation, collapse whitespace, trim and lower-case
    def cleanAndFormat(text: String): String = {
      val withoutPunctuation = """[^\w\s]""".r.replaceAllIn(text, "")
      val singleSpaced = """\s+""".r.replaceAllIn(withoutPunctuation, " ")
      singleSpaced.trim.toLowerCase
    }

    val messyText = " Hello, World!!! How are you??? "
    println(s"Cleaned and formatted: '${cleanAndFormat(messyText)}'")
  }
}
#Pattern Matching Integration
#Regex with Pattern Matching
/** Uses regexes as extractors in `match` expressions to classify and validate input strings. */
object RegexPatternMatching {
  // Each pattern doubles as an extractor: it binds one String per capture group and only
  // succeeds when the whole input matches.
  val EmailPattern: Regex = """(\w+)@(\w+\.\w+)""".r
  val PhonePattern: Regex = """(\d{3})-(\d{3})-(\d{4})""".r
  val DatePattern: Regex = """(\d{4})-(\d{2})-(\d{2})""".r
  val TimePattern: Regex = """(\d{2}):(\d{2}):(\d{2})""".r
  val IpPattern: Regex = """(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})""".r

  /**
   * Describes the shape of `input` (surrounding whitespace ignored).
   *
   * This is a purely syntactic classification: values are not range-checked, so
   * "2023-13-45" is still reported as a date.
   *
   * @param input text to classify
   * @return a human-readable description, or "Unrecognized format: ..." when nothing matches
   */
  def classifyInput(input: String): String = input.trim match {
    case EmailPattern(user, domain) =>
      s"Email address: Username=$user, Domain=$domain"
    case PhonePattern(area, exchange, number) =>
      s"Phone number: Area=$area, Exchange=$exchange, Number=$number"
    case DatePattern(year, month, day) =>
      s"Date: $year year, $month month, $day day"
    case TimePattern(hour, minute, second) =>
      s"Time: $hour:$minute:$second"
    case IpPattern(a, b, c, d) =>
      s"IP address: $a.$b.$c.$d"
    case _ =>
      s"Unrecognized format: $input"
  }

  /**
   * Extracts the components of an email, phone number or calendar date.
   *
   * @param input text to validate; it is matched as-is, without trimming
   * @return the extracted fields plus a "type" entry, or `None` when the input matches no
   *         pattern or is a date that does not exist
   */
  def validateAndExtract(input: String): Option[Map[String, String]] = input match {
    case EmailPattern(user, domain) =>
      Some(Map("type" -> "email", "user" -> user, "domain" -> domain))
    case PhonePattern(area, exchange, number) =>
      Some(Map("type" -> "phone", "area" -> area, "exchange" -> exchange, "number" -> number))
    // The groups are fixed-width digit runs, so toInt cannot fail here
    case DatePattern(year, month, day) if isValidDate(year.toInt, month.toInt, day.toInt) =>
      Some(Map("type" -> "date", "year" -> year, "month" -> month, "day" -> day))
    case _ => None
  }

  /**
   * Tells whether year/month/day name a real calendar date.
   *
   * Delegates to `java.time.LocalDate.of`, so month lengths and leap years are honoured
   * (a plain `day <= 31` check would accept February 30th).
   *
   * @return `true` only when the date exists
   */
  def isValidDate(year: Int, month: Int, day: Int): Boolean =
    scala.util.Try(java.time.LocalDate.of(year, month, day)).isSuccess

  def main(args: Array[String]): Unit = {
    val inputs = List(
      "john@example.com",
      "555-123-4567",
      "2023-12-25",
      "14:30:45",
      "192.168.1.1",
      "invalid-input",
      "2023-13-45" // Invalid date
    )

    println("Input classification:")
    inputs.foreach(input => println(s"$input -> ${classifyInput(input)}"))

    println("\nValidation and extraction:")
    inputs.foreach { input =>
      validateAndExtract(input) match {
        case Some(data) => println(s"$input -> valid: $data")
        case None => println(s"$input -> invalid")
      }
    }
  }
}
#Practical Application Examples
#Log Analyzer
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
/** Parses lines from three log formats into typed entries and summarises the Apache ones. */
object LogAnalyzer {
  // Apache access-log line. The request is a quoted triple "METHOD URL PROTOCOL"; the closing
  // quote must be part of the pattern, otherwise the protocol group swallows it (`HTTP/1.1"`).
  val ApacheLogPattern: Regex =
    """(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  val ApplicationLogPattern: Regex =
    """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\w+): (.+)""".r
  val ErrorLogPattern: Regex =
    """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ERROR (.+) - (.+)""".r

  /** Common supertype of every parsed entry, so callers can match exhaustively. */
  sealed trait LogEntry

  final case class ApacheLogEntry(
    ip: String,
    timestamp: String,
    method: String,
    url: String,
    protocol: String,
    status: Int,
    size: Int
  ) extends LogEntry

  final case class ApplicationLogEntry(
    timestamp: String,
    level: String,
    logger: String,
    message: String
  ) extends LogEntry

  final case class ErrorLogEntry(
    timestamp: String,
    location: String,
    message: String
  ) extends LogEntry

  /**
   * Parses one log line.
   *
   * @param line a complete line; each pattern must match it in full
   * @return the typed entry, or `None` when no format matches or when an Apache line carries
   *         a size that does not fit in an `Int`
   */
  def parseLogLine(line: String): Option[LogEntry] = line match {
    case ApacheLogPattern(ip, timestamp, method, url, protocol, status, size) =>
      // status is exactly three digits, so toInt is safe; size is an unbounded digit run and
      // may exceed Int.MaxValue, hence toIntOption instead of a throwing toInt.
      size.toIntOption.map { bytes =>
        ApacheLogEntry(ip, timestamp, method, url, protocol, status.toInt, bytes)
      }
    case ApplicationLogPattern(timestamp, level, logger, message) =>
      Some(ApplicationLogEntry(timestamp, level, logger, message))
    case ErrorLogPattern(timestamp, location, message) =>
      Some(ErrorLogEntry(timestamp, location, message))
    case _ => None
  }

  /** Counts how many entries share each value of `key`. */
  private def countBy[K](logs: List[ApacheLogEntry])(key: ApacheLogEntry => K): Map[K, Int] =
    logs.groupMapReduce(key)(_ => 1)(_ + _)

  /** Returns the `n` most frequent keys with their counts, highest count first. */
  private def top[K](counts: Map[K, Int], n: Int = 5): Seq[(K, Int)] =
    counts.toSeq.sortBy { case (_, count) => -count }.take(n)

  /** Prints status-code distribution, the five most requested URLs and the five busiest IPs. */
  def analyzeApacheLogs(logs: List[ApacheLogEntry]): Unit = {
    println("Apache log analysis:")

    val statusCounts = countBy(logs)(_.status)
    println(s"Status code distribution: $statusCounts")

    println("Most accessed URLs:")
    top(countBy(logs)(_.url)).foreach { case (url, count) => println(s" $url: $count times") }

    println("Most active IPs:")
    top(countBy(logs)(_.ip)).foreach { case (ip, count) => println(s" $ip: $count times") }
  }

  def main(args: Array[String]): Unit = {
    val sampleLogs = List(
      """192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234""",
      """192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /api/users HTTP/1.1" 201 567""",
      """2023-12-25 10:00:00 [INFO] UserService: User login successful""",
      """2023-12-25 10:01:00 ERROR DatabaseConnection - Connection timeout""",
      """192.168.1.1 - - [25/Dec/2023:10:02:00 +0000] "GET /about.html HTTP/1.1" 200 890"""
    )

    // flatMap silently drops the lines that match no known format
    val parsedLogs = sampleLogs.flatMap(parseLogLine)
    println("Parsed log entries:")
    parsedLogs.foreach(println)

    // Analyze Apache logs
    val apacheLogs = parsedLogs.collect { case log: ApacheLogEntry => log }
    if (apacheLogs.nonEmpty) {
      analyzeApacheLogs(apacheLogs)
    }
  }
}
#Text Processor
/** Text utilities built on pre-compiled regexes: entity extraction, highlighting, Markdown. */
object TextProcessor {
  // `[` is escaped inside the character class: java.util.regex treats a bare `[` as the start
  // of a nested class, which made this pattern fail to compile ("Unclosed character class")
  // and therefore broke initialisation of the whole object.
  val UrlPattern: Regex = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
  val EmailPattern: Regex = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val HashtagPattern: Regex = """#\w+""".r
  val MentionPattern: Regex = """@\w+""".r
  val PhonePattern: Regex = """\b\d{3}-\d{3}-\d{4}\b""".r

  // Compiled once here rather than on every call
  private val WordPattern: Regex = """\b\w+\b""".r
  private val QuotePattern: Regex = """"([^"]+)"""".r
  // (?s) lets `.` cross line breaks; without it a multi-line code block can never match
  private val CodeBlockPattern: Regex = """(?s)```(\w+)?\n(.*?)\n```""".r

  // Markdown rewrite rules, applied in this order (bold must run before italic)
  private val MarkdownRules: List[(Regex, String)] = List(
    """\*\*([^*]+)\*\*""".r -> "<strong>$1</strong>",
    """\*([^*]+)\*""".r -> "<em>$1</em>",
    """\[([^\]]+)\]\(([^)]+)\)""".r -> "<a href='$2'>$1</a>",
    """`([^`]+)`""".r -> "<code>$1</code>"
  )

  case class TextAnalysis(
    urls: List[String],
    emails: List[String],
    hashtags: List[String],
    mentions: List[String],
    phones: List[String],
    wordCount: Int,
    characterCount: Int
  )

  /** Collects every URL, email, hashtag, mention and phone number in `text`, plus counts. */
  def analyzeText(text: String): TextAnalysis = {
    TextAnalysis(
      urls = UrlPattern.findAllIn(text).toList,
      emails = EmailPattern.findAllIn(text).toList,
      hashtags = HashtagPattern.findAllIn(text).toList,
      mentions = MentionPattern.findAllIn(text).toList,
      phones = PhonePattern.findAllIn(text).toList,
      wordCount = WordPattern.findAllIn(text).size,
      characterCount = text.length
    )
  }

  /** Returns the contents of every non-empty double-quoted span, without the quotes. */
  def extractQuotes(text: String): List[String] =
    QuotePattern.findAllMatchIn(text).map(_.group(1)).toList

  /**
   * Wraps each whole-word, case-insensitive occurrence of a keyword in `**`.
   *
   * Keywords are matched literally: regex metacharacters in them are quoted. Empty keywords
   * are skipped, since they would match at every word boundary.
   */
  def highlightKeywords(text: String, keywords: List[String]): String = {
    keywords.filter(_.nonEmpty).foldLeft(text) { (result, keyword) =>
      val pattern = s"(?i)\\b${Regex.quote(keyword)}\\b".r
      // The replacer's result is read as a template; escape it so `$` and `\` stay literal
      pattern.replaceAllIn(result, m => Regex.quoteReplacement(s"**${m.matched}**"))
    }
  }

  /**
   * Replaces each whole-word, case-insensitive occurrence of a listed word with asterisks
   * (one per character of the listed word). Words are matched literally; empty ones are skipped.
   */
  def censorProfanity(text: String, profanityList: List[String]): String = {
    profanityList.filter(_.nonEmpty).foldLeft(text) { (result, word) =>
      val pattern = s"(?i)\\b${Regex.quote(word)}\\b".r
      pattern.replaceAllIn(result, "*" * word.length)
    }
  }

  /** Returns the body of every fenced code block, single- or multi-line. */
  def extractCodeBlocks(text: String): List[String] =
    CodeBlockPattern.findAllMatchIn(text).map(_.group(2)).toList

  /** Converts bold, italic, link and inline-code Markdown to HTML, in that order. */
  def formatMarkdown(text: String): String =
    MarkdownRules.foldLeft(text) { case (result, (pattern, replacement)) =>
      pattern.replaceAllIn(result, replacement)
    }

  def main(args: Array[String]): Unit = {
    // The quoted sentence gives extractQuotes something to find
    val sampleText =
      """
        |Check my website https://www.example.com or send email to contact@example.com
        |Follow me @username and use tag #scala #programming
        |Phone: 555-123-4567
        |"This is a quoted text"
        |**bold text** and *italic text*
        |[link text](https://link.com)
        |`code snippet`
        |""".stripMargin

    // Text analysis
    val analysis = analyzeText(sampleText)
    println("Text analysis results:")
    println(s"URLs: ${analysis.urls}")
    println(s"Emails: ${analysis.emails}")
    println(s"Hashtags: ${analysis.hashtags}")
    println(s"Mentions: ${analysis.mentions}")
    println(s"Phones: ${analysis.phones}")
    println(s"Word count: ${analysis.wordCount}")
    println(s"Character count: ${analysis.characterCount}")

    // Extract quotes
    val quotes = extractQuotes(sampleText)
    println(s"\nQuotes content: $quotes")

    // Keyword highlighting
    val highlighted = highlightKeywords(sampleText, List("scala", "programming"))
    println(s"\nKeyword highlights:\n$highlighted")

    // Markdown formatting
    val formatted = formatMarkdown(sampleText)
    println(s"\nMarkdown formatted:\n$formatted")
  }
}
#Data Validator
/** Validates form-style fields against a table of regex rules, plus two custom validators. */
object DataValidator {
  // Validation patterns (anchored; `validate` additionally requires a full match)
  val EmailPattern: Regex = """^[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}$""".r
  val PhonePattern: Regex = """^\d{3}-\d{3}-\d{4}$""".r
  val ZipCodePattern: Regex = """^\d{5}(-\d{4})?$""".r
  val CreditCardPattern: Regex = """^\d{4}-\d{4}-\d{4}-\d{4}$""".r
  val PasswordPattern: Regex =
    """^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""".r
  val UsernamePattern: Regex = """^[a-zA-Z0-9_]{3,20}$""".r
  // `[` is escaped inside the character class: java.util.regex treats a bare `[` as the start
  // of a nested class, which made this pattern fail to compile ("Unclosed character class")
  // and therefore broke initialisation of the whole object.
  val UrlPattern: Regex = """^https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?$""".r

  // Compiled once here rather than on every validateAge / validateDate call
  private val AgePattern: Regex = """^\d+$""".r
  private val IsoDatePattern: Regex = """^(\d{4})-(\d{2})-(\d{2})$""".r

  sealed trait ValidationResult
  case object Valid extends ValidationResult
  case class Invalid(message: String) extends ValidationResult

  case class ValidationRule(name: String, pattern: Regex, errorMessage: String)

  val validationRules: Map[String, ValidationRule] = Map(
    "email" -> ValidationRule("Email", EmailPattern, "Email format invalid"),
    "phone" -> ValidationRule("Phone", PhonePattern, "Phone format should be XXX-XXX-XXXX"),
    "zipcode" -> ValidationRule("Zip code", ZipCodePattern, "Zip code format invalid"),
    "creditcard" -> ValidationRule("Credit card", CreditCardPattern, "Credit card format should be XXXX-XXXX-XXXX-XXXX"),
    "password" -> ValidationRule("Password", PasswordPattern, "Password must contain uppercase letters, lowercase, numbers and special characters, at least 8 characters"),
    "username" -> ValidationRule("Username", UsernamePattern, "Username can only contain letters, numbers and underscores, 3-20 characters"),
    "url" -> ValidationRule("URL", UrlPattern, "URL format invalid")
  )

  /**
   * Validates `value` against the rule registered for `fieldType`.
   *
   * @param fieldType a key of [[validationRules]]
   * @param value the text to check; it must match the rule's pattern in full
   * @return `Valid`, or `Invalid` carrying the rule's message (or an unknown-field message)
   */
  def validate(fieldType: String, value: String): ValidationResult = {
    validationRules.get(fieldType) match {
      case Some(rule) =>
        if (rule.pattern.matches(value)) Valid
        else Invalid(rule.errorMessage)
      case None =>
        Invalid(s"Unknown field type: $fieldType")
    }
  }

  /** Validates every (fieldType -> value) pair; the result is keyed by field type. */
  def validateMultiple(data: Map[String, String]): Map[String, ValidationResult] = {
    data.map { case (fieldType, value) =>
      fieldType -> validate(fieldType, value)
    }
  }

  /**
   * Accepts a string of digits denoting an age between 0 and 150.
   *
   * @return `Invalid("Age must be a number")` for non-digit input and
   *         `Invalid("Age must be between 0 and 150")` for out-of-range values, including
   *         digit strings too large for an `Int`
   */
  def validateAge(ageStr: String): ValidationResult = {
    ageStr match {
      case AgePattern() =>
        // The pattern guarantees digits only, hence >= 0. toIntOption returns None instead of
        // throwing when the digit run exceeds Int.MaxValue.
        if (ageStr.toIntOption.exists(_ <= 150)) Valid
        else Invalid("Age must be between 0 and 150")
      case _ =>
        Invalid("Age must be a number")
    }
  }

  /**
   * Accepts a real calendar date written as YYYY-MM-DD with a year between 1900 and 2100.
   *
   * Existence is checked with `java.time.LocalDate.of`, so month lengths and leap years are
   * honoured (2023-02-31 is rejected).
   */
  def validateDate(dateStr: String): ValidationResult = {
    dateStr match {
      case IsoDatePattern(year, month, day) =>
        // Fixed-width digit groups: toInt cannot fail here
        val y = year.toInt
        val exists = scala.util.Try(java.time.LocalDate.of(y, month.toInt, day.toInt)).isSuccess
        if (exists && y >= 1900 && y <= 2100) {
          Valid
        } else {
          Invalid("Date values outside valid range")
        }
      case _ =>
        Invalid("Date format should be YYYY-MM-DD")
    }
  }

  def main(args: Array[String]): Unit = {
    val testData = Map(
      "email" -> "user@example.com",
      "phone" -> "555-123-4567",
      "zipcode" -> "12345",
      "creditcard" -> "1234-5678-9012-3456",
      "password" -> "MyPass123!",
      "username" -> "user_123",
      "url" -> "https://www.example.com"
    )
    val invalidData = Map(
      "email" -> "invalid-email",
      "phone" -> "123456789",
      "zipcode" -> "abc",
      "password" -> "weak",
      // Contains a space and '!', which the username rule forbids
      "username" -> "invalid user!"
    )

    println("Valid data validation:")
    validateMultiple(testData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\nInvalid data validation:")
    validateMultiple(invalidData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\nCustom validation:")
    println(s"Age '25': ${validateAge("25")}")
    println(s"Age '200': ${validateAge("200")}")
    println(s"Date '2023-12-25': ${validateDate("2023-12-25")}")
    println(s"Date '2023-13-45': ${validateDate("2023-13-45")}")
  }
}
#Performance Optimization
#Compiling and Caching
/** Contrasts per-call regex compilation with pre-compiled patterns and times both. */
object RegexPerformance {
  // Compiled once, when the object is initialised
  val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val PhonePattern = """\d{3}-\d{3}-\d{4}""".r

  /** Keeps the texts containing an email. Deliberately slow: compiles the regex per element. */
  def inefficientMatching(texts: List[String]): List[String] =
    for {
      candidate <- texts
      // A fresh Regex is built on every iteration - this is the anti-pattern being shown
      if """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r.findFirstIn(candidate).nonEmpty
    } yield candidate

  /** Keeps the texts containing an email, reusing the pre-compiled pattern. */
  def efficientMatching(texts: List[String]): List[String] =
    texts.filter(candidate => EmailPattern.findFirstIn(candidate).nonEmpty)

  /**
   * Sorts texts into "emails", "phones" and "others". A text containing both an email and a
   * phone number counts as an email. All three keys are always present.
   */
  def batchProcess(texts: List[String]): Map[String, List[String]] = {
    // Two stable partitions give the same buckets, in the same order, as a single pass
    val (withEmail, withoutEmail) = texts.partition(EmailPattern.findFirstIn(_).nonEmpty)
    val (withPhone, leftover) = withoutEmail.partition(PhonePattern.findFirstIn(_).nonEmpty)
    Map(
      "emails" -> withEmail,
      "phones" -> withPhone,
      "others" -> leftover
    )
  }

  /** Times the three strategies above on synthetic input and prints the durations. */
  def benchmarkRegex(): Unit = {
    // 10000 emails, then 10000 phone numbers, then 10000 non-matching texts
    val testTexts =
      List("user@example.com", "555-123-4567", "random text").flatMap(List.fill(10000)(_))

    // Runs `operation` once, prints the elapsed wall-clock milliseconds, returns its result
    def timeOperation[T](name: String)(operation: => T): T = {
      val startedAt = System.nanoTime()
      val outcome = operation
      val finishedAt = System.nanoTime()
      println(f"$name%20s: ${(finishedAt - startedAt) / 1000000}%6d ms")
      outcome
    }

    println("Regex performance tests:")
    val firstThousand = testTexts.take(1000)
    timeOperation("Inefficient matching")(inefficientMatching(firstThousand))
    timeOperation("Efficient matching")(efficientMatching(firstThousand))
    timeOperation("Batch processing")(batchProcess(testTexts))
  }

  def main(args: Array[String]): Unit =
    benchmarkRegex()
}
#Best Practices
- Pre-compile regular expressions:
  - Avoid recompiling in loops
  - Use the `.r` method or `new Regex()`
  - Define commonly used patterns as constants
- Use raw strings:
  - Use triple quotes to avoid escaping
  - Improve readability and maintainability
  - Reduce complexity of escape characters
- Use capture groups appropriately:
  - Only use capture groups when needed
  - Use non-capturing groups `(?:...)` for better performance
  - Consider named capture groups for readability
- Pattern matching integration:
  - Leverage Scala's pattern matching features
  - Use `unapply` and `unapplySeq`
  - Create custom extractors
- Performance considerations:
  - Avoid overly complex regular expressions
  - Consider using multiple simple patterns instead of one complex pattern
  - Performance test when processing large volumes of data
Regular expressions are powerful tools for text processing, and their integration with pattern matching in Scala makes them more elegant and powerful.