Scala Regular Expressions
Regular expressions are powerful tools for processing text and string matching. Scala provides rich regex support, including pattern matching integration and convenient APIs.
Regex Basics
Creating Regular Expressions
scala
import scala.util.matching.Regex
object RegexBasics {
def main(args: Array[String]): Unit = {
  // Three interchangeable ways to build a Regex: the `.r` extension method,
  // the constructor, and a raw (triple-quoted) string that needs no double escaping.
  val viaDotR: Regex = "\\d+".r
  val viaConstructor: Regex = new Regex("\\d+")
  val viaRawString: Regex = """(\d{4})-(\d{2})-(\d{2})""".r
  // Flags can be embedded at the start of the pattern text.
  val ignoringCase: Regex = "(?i)hello".r
  val perLineAnchors: Regex = "(?m)^start".r
  println("Regex creation successful")

  val sentence = "The year 2023 has 365 days"
  val digits = "\\d+".r

  // findFirstIn returns an Option holding the first match, if any.
  val firstNumber = digits.findFirstIn(sentence).getOrElse("not found")
  println(s"First number: $firstNumber")

  // findAllIn returns an iterator over every match.
  val everyNumber = digits.findAllIn(sentence).toList
  println(s"All numbers: $everyNumber")

  // A defined first match means the text contains at least one number.
  val containsNumber = digits.findFirstIn(sentence).nonEmpty
  println(s"Contains numbers: $containsNumber")
}
}
Basic Matching Operations
scala
object BasicMatching {
def main(args: Array[String]): Unit = {
  // One pattern per kind of contact detail to pull out of the sample text.
  val emailRegex = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val phoneRegex = """\d{3}-\d{3}-\d{4}""".r
  val dateRegex = """\d{4}-\d{2}-\d{2}""".r
  val contactSheet = """
Contact information:
Email: john.doe@example.com, alice@company.org
Phone: 555-123-4567, 555-987-6543
Date: 2023-12-25, 2024-01-01
"""

  // Every match of `regex` in the sample text, joined for display.
  def joined(regex: scala.util.matching.Regex): String =
    regex.findAllIn(contactSheet).toList.mkString(", ")

  println(s"Email addresses: ${joined(emailRegex)}")
  println(s"Phone numbers: ${joined(phoneRegex)}")
  println(s"Dates: ${joined(dateRegex)}")

  // findAllMatchIn yields Match objects, which also expose the match offsets.
  println("\nDetailed match information:")
  for (hit <- emailRegex.findAllMatchIn(contactSheet))
    println(s"Email: '${hit.matched}' position: ${hit.start}-${hit.end}")
}
}
Capture Groups
Basic Capture Groups
scala
object CaptureGroups {
def main(args: Array[String]): Unit = {
  // Parenthesised sub-patterns become numbered groups; group(0) is the whole match.
  val fullName = """(\w+)\s+(\w+)""".r // First and last name
  val emailParts = """(\w+)@(\w+\.\w+)""".r // Username and domain
  val isoDate = """(\d{4})-(\d{2})-(\d{2})""".r // Year, month, day
  val sentence = "John Smith's email is john@example.com, registration date is 2023-12-25"

  // Names: search a fixed two-word string.
  fullName.findFirstMatchIn("John Smith").fold(println("Name pattern not found")) { hit =>
    println(s"First name: ${hit.group(1)}")
    println(s"Last name: ${hit.group(2)}")
    println(s"Full match: ${hit.group(0)}")
  }

  // Email: first address found anywhere in the sentence.
  emailParts.findFirstMatchIn(sentence).fold(println("Email not found")) { hit =>
    println(s"Username: ${hit.group(1)}")
    println(s"Domain: ${hit.group(2)}")
  }

  // Date: first ISO-style date found anywhere in the sentence.
  isoDate.findFirstMatchIn(sentence).fold(println("Date pattern not found")) { hit =>
    println(s"Year: ${hit.group(1)}")
    println(s"Month: ${hit.group(2)}")
    println(s"Day: ${hit.group(3)}")
  }

  // A Regex is also an extractor (unapplySeq), so groups can be bound directly in a match.
  val candidate = "2023-12-25"
  candidate match {
    case isoDate(yyyy, mm, dd) =>
      println(s"Pattern match - Year: $yyyy, Month: $mm, Day: $dd")
    case _ =>
      println("Date format doesn't match")
  }
}
}
Named Capture Groups
scala
object NamedCaptureGroups {
def main(args: Array[String]): Unit = {
  // Java-style named capture groups (Scala 2.13+)
  val namedLogRegex = """(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?<level>\w+)\] (?<message>.+)""".r
  val sampleLine = "2023-12-25 10:30:45 [INFO] Application started successfully"

  // Groups are looked up by the name given inside (?<name>...).
  namedLogRegex.findFirstMatchIn(sampleLine).fold(println("Log format doesn't match")) { hit =>
    println(s"Timestamp: ${hit.group("timestamp")}")
    println(s"Level: ${hit.group("level")}")
    println(s"Message: ${hit.group("message")}")
  }

  // Alternative: plain positional groups mapped onto a case class.
  case class LogEntry(timestamp: String, level: String, message: String)
  val positionalLogRegex = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)""".r

  // Some(entry) when the line has the expected shape, None otherwise.
  def parseLogEntry(log: String): Option[LogEntry] =
    Some(log).collect {
      case positionalLogRegex(stamp, severity, body) => LogEntry(stamp, severity, body)
    }

  parseLogEntry(sampleLine).fold(println("Unable to parse log")) { entry =>
    println(s"Parsed log: $entry")
  }
}
}
String Replacement
Basic Replacement Operations
scala
object StringReplacement {
def main(args: Array[String]): Unit = {
  val sentence = "The quick brown fox jumps over the lazy dog. The fox is quick."
  val foxRegex = "fox".r

  // Replace every occurrence with a fixed string.
  val allFoxesToCats = foxRegex.replaceAllIn(sentence, "cat")
  println(s"Replace fox -> cat: $allFoxesToCats")

  // Replace only the first occurrence.
  val firstFoxToWolf = foxRegex.replaceFirstIn(sentence, "wolf")
  println(s"Replace first fox -> wolf: $firstFoxToWolf")

  // $1, $2, $3 in the replacement refer to the captured groups.
  val dashedPhone = """(\d{3})-(\d{3})-(\d{4})""".r
  val phoneLine = "Phone numbers: 555-123-4567 and 555-987-6543"
  val prettyPhones = dashedPhone.replaceAllIn(phoneLine, "($1) $2-$3")
  println(s"Formatted phones: $prettyPhones")

  // A function computes the replacement from each Match.
  val integerRegex = """\d+""".r
  val fruitLine = "I have 5 apples and 10 oranges"
  val doubled = integerRegex.replaceAllIn(fruitLine, hit => (2 * hit.matched.toInt).toString)
  println(s"Numbers doubled: $doubled")

  // Upper-case only words longer than four characters; shorter words pass through unchanged.
  val wordRegex = """\b\w+\b""".r
  val shouted = wordRegex.replaceAllIn(sentence, { hit =>
    val word = hit.matched
    if (word.length <= 4) word else word.toUpperCase
  })
  println(s"Long words uppercase: $shouted")
}
}
Advanced Replacement Techniques
scala
object AdvancedReplacement {
def main(args: Array[String]): Unit = {
  // HTML tag cleanup: remove anything of the form <...>.
  val htmlText = "<p>This is an <strong>important</strong> <em>message</em>.</p>"
  val htmlTagPattern = """<[^>]+>""".r
  val cleanText = htmlTagPattern.replaceAllIn(htmlText, "")
  println(s"Cleaned HTML: $cleanText")

  // URL conversion to links.
  // Inside a Java character class an unescaped '[' opens a nested class, so it is written
  // as \[ here; left unescaped, Pattern.compile fails with "Unclosed character class".
  val urlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
  val textWithUrls = "Visit https://www.example.com or http://blog.example.org/post"
  val linkedText = urlPattern.replaceAllIn(textWithUrls, m => {
    // The result of a replacer function is still scanned for $-group references and
    // backslashes, and the URL class allows '$', so the matched text must be quoted.
    // NOTE(review): the class also allows a single quote, which would end the href
    // attribute early; HTML-escape the URL before using this on untrusted text.
    val url = scala.util.matching.Regex.quoteReplacement(m.matched)
    s"<a href='$url'>$url</a>"
  })
  println(s"URL to links: $linkedText")

  // Sensitive information masking: keep the first and last groups, mask the middle two.
  val creditCardPattern = """(\d{4})-(\d{4})-(\d{4})-(\d{4})""".r
  val sensitiveText = "Credit card number: 1234-5678-9012-3456"
  val maskedText = creditCardPattern.replaceAllIn(sensitiveText, "$1-****-****-$4")
  println(s"Credit card masked: $maskedText")

  // Date format conversion: YYYY-MM-DD -> DD/MM/YYYY by reordering the groups.
  val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r
  val dateText = "Meeting date: 2023-12-25"
  val reformattedDate = datePattern.replaceAllIn(dateText, "$3/$2/$1")
  println(s"Date format conversion: $reformattedDate")

  // Multi-step replacement: strip punctuation, collapse whitespace runs, trim, lower-case.
  def cleanAndFormat(text: String): String = {
    val withoutPunctuation = """[^\w\s]""".r.replaceAllIn(text, "")
    val singleSpaced = """\s+""".r.replaceAllIn(withoutPunctuation, " ")
    singleSpaced.trim.toLowerCase
  }
  val messyText = " Hello, World!!! How are you??? "
  println(s"Cleaned and formatted: '${cleanAndFormat(messyText)}'")
}
}
Pattern Matching Integration
Regex with Pattern Matching
scala
object RegexPatternMatching {
// Define various patterns
// These are used as extractors in the `match` expressions of classifyInput and
// validateAndExtract; each capture group is bound to one pattern variable, in order.
// Email: group 1 = user name, group 2 = domain (word characters with exactly one dot).
val EmailPattern = """(\w+)@(\w+\.\w+)""".r
// Phone: three dash-separated digit groups of length 3, 3 and 4.
val PhonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
// Date: year, month, day as 4, 2 and 2 digits; the digit values are not range-checked here.
val DatePattern = """(\d{4})-(\d{2})-(\d{2})""".r
// Time: hour, minute, second as two digits each; the digit values are not range-checked here.
val TimePattern = """(\d{2}):(\d{2}):(\d{2})""".r
// IPv4-shaped text: four dot-separated groups of 1-3 digits; values above 255 are not rejected.
val IpPattern = """(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})""".r
/** Describes which of the known formats the trimmed input matches.
  *
  * Patterns are tried in the order email, phone, date, time, IP; the first one that
  * matches wins. The fallback message echoes the original, untrimmed input.
  */
def classifyInput(input: String): String = {
  val candidate = input.trim
  candidate match {
    case EmailPattern(localPart, host) =>
      s"Email address: Username=$localPart, Domain=$host"
    case PhonePattern(areaCode, prefix, line) =>
      s"Phone number: Area=$areaCode, Exchange=$prefix, Number=$line"
    case DatePattern(yyyy, mm, dd) =>
      s"Date: $yyyy year, $mm month, $dd day"
    case TimePattern(hh, mi, ss) =>
      s"Time: $hh:$mi:$ss"
    case IpPattern(o1, o2, o3, o4) =>
      s"IP address: $o1.$o2.$o3.$o4"
    case _ =>
      s"Unrecognized format: $input"
  }
}
/** Extracts the parts of an email, phone number or date into a string map.
  *
  * The map always carries a "type" key ("email", "phone" or "date") plus one key per
  * captured part. Dates are accepted only when isValidDate approves the numeric values.
  * Returns None when no pattern matches. The input is matched as given, without trimming.
  */
def validateAndExtract(input: String): Option[Map[String, String]] =
  Some(input).collect {
    case EmailPattern(localPart, host) =>
      Map("type" -> "email", "user" -> localPart, "domain" -> host)
    case PhonePattern(areaCode, prefix, line) =>
      Map("type" -> "phone", "area" -> areaCode, "exchange" -> prefix, "number" -> line)
    case DatePattern(yyyy, mm, dd) if isValidDate(yyyy.toInt, mm.toInt, dd.toInt) =>
      Map("type" -> "date", "year" -> yyyy, "month" -> mm, "day" -> dd)
  }
/** Returns true when (year, month, day) names a real calendar date.
  *
  * The day is checked against the actual length of the given month, so leap years are
  * honoured and dates such as February 30 or April 31 are rejected.
  *
  * @param year  proleptic ISO year; values outside the range supported by java.time yield false
  * @param month month of year, 1-12
  * @param day   day of month, starting at 1
  */
def isValidDate(year: Int, month: Int, day: Int): Boolean = {
  // YearMonth.of throws for out-of-range arguments, so both are range-checked first.
  val yearSupported = year >= java.time.Year.MIN_VALUE && year <= java.time.Year.MAX_VALUE
  yearSupported && month >= 1 && month <= 12 && day >= 1 &&
    day <= java.time.YearMonth.of(year, month).lengthOfMonth
}
def main(args: Array[String]): Unit = {
  // One sample per recognised format, plus two that should be rejected.
  val samples = List(
    "john@example.com",
    "555-123-4567",
    "2023-12-25",
    "14:30:45",
    "192.168.1.1",
    "invalid-input",
    "2023-13-45" // Invalid date
  )

  println("Input classification:")
  for (sample <- samples)
    println(s"$sample -> ${classifyInput(sample)}")

  println("\nValidation and extraction:")
  for (sample <- samples) {
    val verdict = validateAndExtract(sample).fold("invalid")(data => s"valid: $data")
    println(s"$sample -> $verdict")
  }
}
}
Practical Application Examples
Log Analyzer
scala
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
object LogAnalyzer {
// Different log pattern types
// Apache access log: client host, [timestamp], "METHOD url protocol", status, response size.
// The closing quote of the request is matched explicitly; without it the protocol group
// would swallow the quote and capture `HTTP/1.1"`.
val ApacheLogPattern = """(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
// Application log: timestamp, [LEVEL], logger name, then the message after ": ".
val ApplicationLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\w+): (.+)""".r
// Error log: timestamp, the literal word ERROR, a location, then the message after " - ".
val ErrorLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ERROR (.+) - (.+)""".r
// One parsed Apache access-log line. parseLogLine fills the fields from the capture
// groups of ApacheLogPattern, converting status and size to Int.
case class ApacheLogEntry(
ip: String,
timestamp: String,
method: String,
url: String,
protocol: String,
status: Int,
size: Int
)
// One parsed application-log line; fields come from the capture groups of ApplicationLogPattern.
case class ApplicationLogEntry(
timestamp: String,
level: String,
logger: String,
message: String
)
// One parsed error-log line; fields come from the capture groups of ErrorLogPattern.
case class ErrorLogEntry(
timestamp: String,
location: String,
message: String
)
/** Parses one log line into the matching entry type.
  *
  * Patterns are tried in the order Apache, application, error. Returns None when the
  * line matches none of them, or when an Apache line's size does not fit in an Int.
  *
  * @param line a single raw log line
  * @return Some(ApacheLogEntry | ApplicationLogEntry | ErrorLogEntry), or None
  */
def parseLogLine(line: String): Option[Any] = line match {
  case ApacheLogPattern(ip, timestamp, method, url, protocol, status, size) =>
    // status is exactly three digits, so toInt cannot fail. size is an unbounded digit
    // run and may exceed Int.MaxValue; toIntOption turns that into None instead of
    // letting a NumberFormatException escape.
    size.toIntOption.map { bytes =>
      ApacheLogEntry(ip, timestamp, method, url, protocol, status.toInt, bytes)
    }
  case ApplicationLogPattern(timestamp, level, logger, message) =>
    Some(ApplicationLogEntry(timestamp, level, logger, message))
  case ErrorLogPattern(timestamp, location, message) =>
    Some(ErrorLogEntry(timestamp, location, message))
  case _ => None
}
/** Prints three summaries of the given Apache entries: a count per status code, the
  * five most requested URLs, and the five most active client IPs.
  */
def analyzeApacheLogs(logs: List[ApacheLogEntry]): Unit = {
  // Number of entries sharing each value of `key`.
  def tally[K](key: ApacheLogEntry => K): Map[K, Int] =
    logs.groupBy(key).map { case (k, entries) => k -> entries.size }

  // The five keys with the highest counts, highest first.
  def topFive[K](counts: Map[K, Int]): Seq[(K, Int)] =
    counts.toSeq.sortBy { case (_, n) => -n }.take(5)

  println("Apache log analysis:")
  println(s"Status code distribution: ${tally(_.status)}")

  println("Most accessed URLs:")
  topFive(tally(_.url)).foreach { case (url, hits) => println(s" $url: $hits times") }

  println("Most active IPs:")
  topFive(tally(_.ip)).foreach { case (ip, hits) => println(s" $ip: $hits times") }
}
def main(args: Array[String]): Unit = {
  // A mix of Apache access lines, one application line and one error line.
  val rawLines = List(
    """192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234""",
    """192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /api/users HTTP/1.1" 201 567""",
    """2023-12-25 10:00:00 [INFO] UserService: User login successful""",
    """2023-12-25 10:01:00 ERROR DatabaseConnection - Connection timeout""",
    """192.168.1.1 - - [25/Dec/2023:10:02:00 +0000] "GET /about.html HTTP/1.1" 200 890"""
  )

  // Lines that match no pattern are dropped.
  val entries = rawLines.flatMap(line => parseLogLine(line))
  println("Parsed log entries:")
  for (entry <- entries) println(entry)

  // Only the Apache entries feed the statistics; skip the report when there are none.
  val accessEntries = entries.collect { case access: ApacheLogEntry => access }
  if (accessEntries.nonEmpty) analyzeApacheLogs(accessEntries)
}
}
Text Processor
scala
object TextProcessor {
// Various text patterns
// URL: http/https, a host, and an optional path. Inside a Java character class an
// unescaped '[' opens a nested class, so it is written as \[ here; left unescaped,
// compiling the pattern fails with "Unclosed character class".
val UrlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
// Email: local part, '@', domain, and a top-level label of at least two letters.
val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
// Hashtag / mention: '#' or '@' followed by word characters.
val HashtagPattern = """#\w+""".r
val MentionPattern = """@\w+""".r
// Phone: NNN-NNN-NNNN delimited by word boundaries.
val PhonePattern = """\b\d{3}-\d{3}-\d{4}\b""".r
// Result of analyzeText: every match of each pattern found in the text, plus two counts.
case class TextAnalysis(
urls: List[String],
emails: List[String],
hashtags: List[String],
mentions: List[String],
phones: List[String],
// Number of \b\w+\b matches in the text.
wordCount: Int,
// Length of the text as reported by String.length.
characterCount: Int
)
/** Collects every URL, email, hashtag, mention and phone number found in `text`,
  * together with its word count and character count.
  */
def analyzeText(text: String): TextAnalysis = {
  // All matches of one pattern, in order of appearance.
  def matchesOf(regex: scala.util.matching.Regex): List[String] =
    regex.findAllIn(text).toList

  val words = """\b\w+\b""".r.findAllIn(text)
  TextAnalysis(
    matchesOf(UrlPattern),
    matchesOf(EmailPattern),
    matchesOf(HashtagPattern),
    matchesOf(MentionPattern),
    matchesOf(PhonePattern),
    words.size,
    text.length
  )
}
/** Returns the contents of every non-empty double-quoted span in `text`, without the quotes. */
def extractQuotes(text: String): List[String] = {
  // A double quote, one or more non-quote characters (group 1), and a closing double quote.
  val quotedSpan = """"([^"]+)"""".r
  (for (hit <- quotedSpan.findAllMatchIn(text)) yield hit.group(1)).toList
}
/** Wraps every whole-word, case-insensitive occurrence of each keyword in `**...**`.
  *
  * Keywords are matched literally: regex metacharacters in a keyword have no special
  * meaning. Empty keywords are ignored.
  *
  * @param text     the text to annotate
  * @param keywords the words to highlight, applied one after another
  * @return the text with each occurrence wrapped, preserving its original casing
  */
def highlightKeywords(text: String, keywords: List[String]): String =
  // An empty keyword would reduce the pattern to \b\b and match at every word boundary.
  keywords.filter(_.nonEmpty).foldLeft(text) { (result, keyword) =>
    // Quote the keyword so characters such as '.', '+' or '$' are taken literally.
    val pattern = ("(?i)\\b" + scala.util.matching.Regex.quote(keyword) + "\\b").r
    // The replacer's result is scanned for $-group references, so quote it as well.
    pattern.replaceAllIn(result, m =>
      scala.util.matching.Regex.quoteReplacement(s"**${m.matched}**"))
  }
/** Replaces every whole-word, case-insensitive occurrence of each listed word with
  * asterisks, one per character of the listed word.
  *
  * Words are matched literally: regex metacharacters in a word have no special meaning.
  * Empty words are ignored.
  *
  * @param text          the text to censor
  * @param profanityList the words to mask, applied one after another
  */
def censorProfanity(text: String, profanityList: List[String]): String =
  // An empty word would reduce the pattern to \b\b and match at every word boundary.
  profanityList.filter(_.nonEmpty).foldLeft(text) { (result, word) =>
    // Quote the word so characters such as '.' or '*' are taken literally.
    val pattern = ("(?i)\\b" + scala.util.matching.Regex.quote(word) + "\\b").r
    // A run of '*' contains neither '$' nor '\', so it is safe as a replacement string.
    pattern.replaceAllIn(result, "*" * word.length)
  }
/** Returns the body of every fenced code block (three backticks, optional language
  * tag, newline, body, newline, three backticks) found in `text`.
  *
  * Bodies may span several lines; the fences and the language tag are not included.
  */
def extractCodeBlocks(text: String): List[String] = {
  // (?s) lets '.' match line terminators; without it only one-line bodies would match.
  // The lazy .*? stops at the first closing fence, so adjacent blocks stay separate.
  val codeBlockPattern = """(?s)```(\w+)?\n(.*?)\n```""".r
  codeBlockPattern.findAllMatchIn(text).map(_.group(2)).toList
}
/** Converts a small subset of Markdown to HTML: bold, italic, links and inline code,
  * applied in that order.
  */
def formatMarkdown(text: String): String = {
  // (pattern, replacement) pairs. Order matters: bold must run before italic so that
  // `**x**` is not consumed as two italic markers.
  val rewrites = List(
    """\*\*([^*]+)\*\*""" -> "<strong>$1</strong>", // Bold
    """\*([^*]+)\*""" -> "<em>$1</em>", // Italic
    """\[([^\]]+)\]\(([^)]+)\)""" -> "<a href='$2'>$1</a>", // Links
    """`([^`]+)`""" -> "<code>$1</code>" // Code
  )
  rewrites.foldLeft(text) { case (soFar, (regexText, replacement)) =>
    regexText.r.replaceAllIn(soFar, replacement)
  }
}
def main(args: Array[String]): Unit = {
  // Sample covering every pattern plus the Markdown constructs handled by formatMarkdown.
  val demoText = """
Check my website https://www.example.com or send email to contact@example.com
Follow me @username and use tag #scala #programming
Phone: 555-123-4567
This is a quoted text
**bold text** and *italic text*
[link text](https://link.com)
`code snippet`
"""

  // Pattern-based extraction and counts.
  val report = analyzeText(demoText)
  println("Text analysis results:")
  println(s"URLs: ${report.urls}")
  println(s"Emails: ${report.emails}")
  println(s"Hashtags: ${report.hashtags}")
  println(s"Mentions: ${report.mentions}")
  println(s"Phones: ${report.phones}")
  println(s"Word count: ${report.wordCount}")
  println(s"Character count: ${report.characterCount}")

  // Contents of double-quoted spans.
  val quoted = extractQuotes(demoText)
  println(s"\nQuotes content: $quoted")

  // Keyword highlighting.
  val emphasised = highlightKeywords(demoText, List("scala", "programming"))
  println(s"\nKeyword highlights:\n$emphasised")

  // Markdown to HTML.
  val asHtml = formatMarkdown(demoText)
  println(s"\nMarkdown formatted:\n$asHtml")
}
}
Data Validator
scala
object DataValidator {
// Validation patterns
// Each pattern is anchored with ^...$ so it describes the whole value.
val EmailPattern = """^[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}$""".r
val PhonePattern = """^\d{3}-\d{3}-\d{4}$""".r
// Five digits, optionally followed by a dash and four more.
val ZipCodePattern = """^\d{5}(-\d{4})?$""".r
val CreditCardPattern = """^\d{4}-\d{4}-\d{4}-\d{4}$""".r
// Lookaheads require a lower-case letter, an upper-case letter, a digit and one of
// @$!%*?&; the final class limits the allowed characters and enforces a minimum of 8.
val PasswordPattern = """^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""".r
val UsernamePattern = """^[a-zA-Z0-9_]{3,20}$""".r
// Inside a Java character class an unescaped '[' opens a nested class, so it is written
// as \[ here; left unescaped, compiling the pattern fails with "Unclosed character class".
val UrlPattern = """^https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?$""".r
/** Outcome of validating one value: either Valid, or Invalid with a reason. */
sealed trait ValidationResult
case object Valid extends ValidationResult
final case class Invalid(message: String) extends ValidationResult

/** A named pattern plus the message reported when a value does not match it.
  *
  * The Regex type is fully qualified so this snippet compiles without relying on an
  * import made elsewhere.
  */
final case class ValidationRule(name: String, pattern: scala.util.matching.Regex, errorMessage: String)

// Rules keyed by the field-type string that callers pass to validate.
val validationRules: Map[String, ValidationRule] = Map(
  "email" -> ValidationRule("Email", EmailPattern, "Email format invalid"),
  "phone" -> ValidationRule("Phone", PhonePattern, "Phone format should be XXX-XXX-XXXX"),
  "zipcode" -> ValidationRule("Zip code", ZipCodePattern, "Zip code format invalid"),
  "creditcard" -> ValidationRule("Credit card", CreditCardPattern, "Credit card format should be XXXX-XXXX-XXXX-XXXX"),
  "password" -> ValidationRule("Password", PasswordPattern, "Password must contain uppercase letters, lowercase, numbers and special characters, at least 8 characters"),
  "username" -> ValidationRule("Username", UsernamePattern, "Username can only contain letters, numbers and underscores, 3-20 characters"),
  "url" -> ValidationRule("URL", UrlPattern, "URL format invalid")
)
/** Validates `value` against the rule registered for `fieldType`.
  *
  * @return Valid when the whole value matches the rule's pattern, Invalid with the
  *         rule's message when it does not, and Invalid naming the field type when no
  *         rule is registered for it
  */
def validate(fieldType: String, value: String): ValidationResult =
  validationRules
    .get(fieldType)
    .fold[ValidationResult](Invalid(s"Unknown field type: $fieldType")) { rule =>
      if (!rule.pattern.matches(value)) Invalid(rule.errorMessage) else Valid
    }
/** Validates every entry of `data`, treating each key as the field type of its value. */
def validateMultiple(data: Map[String, String]): Map[String, ValidationResult] =
  for ((field, raw) <- data) yield field -> validate(field, raw)
// Custom validators
/** Validates that `ageStr` is a run of digits denoting an age between 0 and 150.
  *
  * @return Valid for an in-range age; Invalid("Age must be between 0 and 150") for a
  *         numeric value outside that range, including one too large for an Int;
  *         Invalid("Age must be a number") for anything that is not all digits
  */
def validateAge(ageStr: String): ValidationResult = {
  val agePattern = """^\d+$""".r
  ageStr match {
    case agePattern() =>
      // toIntOption is None when the digits overflow Int. Such a value is certainly out
      // of range, so report it that way instead of throwing NumberFormatException.
      if (ageStr.toIntOption.exists(age => age >= 0 && age <= 150)) Valid
      else Invalid("Age must be between 0 and 150")
    case _ =>
      Invalid("Age must be a number")
  }
}
/** Validates a YYYY-MM-DD string as a real calendar date with a year in 1900-2100.
  *
  * The day is checked against the actual length of the month, so leap years are
  * honoured and dates such as 2023-02-30 are rejected.
  *
  * @return Valid; Invalid("Date values outside valid range") when the shape is right
  *         but the values are not a real date in range; Invalid("Date format should
  *         be YYYY-MM-DD") when the shape is wrong
  */
def validateDate(dateStr: String): ValidationResult = {
  val datePattern = """^(\d{4})-(\d{2})-(\d{2})$""".r
  dateStr match {
    case datePattern(year, month, day) =>
      // The groups hold at most four digits each, so toInt cannot overflow.
      val y = year.toInt
      val m = month.toInt
      val d = day.toInt
      // && short-circuits, so YearMonth.of only ever sees an already range-checked month.
      val realDate = y >= 1900 && y <= 2100 && m >= 1 && m <= 12 && d >= 1 &&
        d <= java.time.YearMonth.of(y, m).lengthOfMonth
      if (realDate) Valid else Invalid("Date values outside valid range")
    case _ =>
      Invalid("Date format should be YYYY-MM-DD")
  }
}
def main(args: Array[String]): Unit = {
  // One well-formed value per registered field type.
  val wellFormed = Map(
    "email" -> "user@example.com",
    "phone" -> "555-123-4567",
    "zipcode" -> "12345",
    "creditcard" -> "1234-5678-9012-3456",
    "password" -> "MyPass123!",
    "username" -> "user_123",
    "url" -> "https://www.example.com"
  )
  // Values intended to exercise the failure path.
  val malformed = Map(
    "email" -> "invalid-email",
    "phone" -> "123456789",
    "zipcode" -> "abc",
    "password" -> "weak",
    "username" -> "invalid"
  )

  // Prints a heading followed by one "field: result" line per entry.
  def report(heading: String, fields: Map[String, String]): Unit = {
    println(heading)
    for ((field, outcome) <- validateMultiple(fields))
      println(s"$field: $outcome")
  }

  report("Valid data validation:", wellFormed)
  report("\nInvalid data validation:", malformed)

  println("\nCustom validation:")
  println(s"Age '25': ${validateAge("25")}")
  println(s"Age '200': ${validateAge("200")}")
  println(s"Date '2023-12-25': ${validateDate("2023-12-25")}")
  println(s"Date '2023-13-45': ${validateDate("2023-13-45")}")
}
}
Performance Optimization
Compiling and Caching
scala
object RegexPerformance {
// Pre-compile regex patterns
// Defined once on the object and reused by efficientMatching and batchProcess, in
// contrast to inefficientMatching, which rebuilds the email pattern for every element.
val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
val PhonePattern = """\d{3}-\d{3}-\d{4}""".r
// Anti-example: the pattern literal sits inside the filter, so `.r` runs again for
// every element. Kept this way on purpose as the baseline for the benchmark.
def inefficientMatching(texts: List[String]): List[String] =
  for {
    candidate <- texts
    if """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r.findFirstIn(candidate).nonEmpty
  } yield candidate
/** Keeps the texts that contain an email address, reusing the pre-compiled EmailPattern. */
def efficientMatching(texts: List[String]): List[String] =
  for {
    candidate <- texts
    if EmailPattern.findFirstIn(candidate).nonEmpty
  } yield candidate
// Batch processing optimization
/** Sorts texts into three buckets using the pre-compiled patterns.
  *
  * A text goes to "emails" when it contains an email address, otherwise to "phones"
  * when it contains a phone number, otherwise to "others". All three keys are always
  * present, and each bucket keeps the input order.
  */
def batchProcess(texts: List[String]): Map[String, List[String]] = {
  // Two stable partitions replace the three mutable buffers. Each text is still tested
  // against the email pattern first and against the phone pattern only if that fails.
  val (emailTexts, nonEmailTexts) = texts.partition(t => EmailPattern.findFirstIn(t).isDefined)
  val (phoneTexts, otherTexts) = nonEmailTexts.partition(t => PhonePattern.findFirstIn(t).isDefined)
  Map(
    "emails" -> emailTexts,
    "phones" -> phoneTexts,
    "others" -> otherTexts
  )
}
/** Times the three matching strategies on generated sample data and prints one line
  * per strategy with the elapsed whole milliseconds.
  */
def benchmarkRegex(): Unit = {
  // 10000 copies each of an email, a phone number and plain text, in that order.
  val corpus = List("user@example.com", "555-123-4567", "random text")
    .flatMap(sample => List.fill(10000)(sample))

  // Runs `body` once, prints how long it took, and hands back its result.
  def timed[T](label: String)(body: => T): T = {
    val startedAt = System.nanoTime()
    val outcome = body
    val elapsedMs = (System.nanoTime() - startedAt) / 1000000
    println(f"$label%20s: $elapsedMs%6d ms")
    outcome
  }

  println("Regex performance tests:")
  // The two matching variants see only the first 1000 texts; batch processing sees all.
  val firstThousand = corpus.take(1000)
  timed("Inefficient matching")(inefficientMatching(firstThousand))
  timed("Efficient matching")(efficientMatching(firstThousand))
  timed("Batch processing")(batchProcess(corpus))
}
// Entry point: runs the benchmark once.
def main(args: Array[String]): Unit = benchmarkRegex()
}
Best Practices
Pre-compile regular expressions:
- Avoid recompiling in loops
- Use the `.r` method or `new Regex()`
- Define commonly used patterns as constants
Use raw strings:
- Use triple quotes to avoid escaping
- Improve readability and maintainability
- Reduce complexity of escape characters
Use capture groups appropriately:
- Only use capture groups when needed
- Use non-capturing groups `(?:...)` for better performance
- Consider named capture groups for readability
Pattern matching integration:
- Leverage Scala's pattern matching features
- Use `unapply` and `unapplySeq`
- Create custom extractors
Performance considerations:
- Avoid overly complex regular expressions
- Consider using multiple simple patterns instead of one complex pattern
- Performance test when processing large volumes of data
Regular expressions are powerful tools for text processing, and their integration with pattern matching in Scala makes them more elegant and powerful.