Skip to content

Scala Regular Expressions

Regular expressions are powerful tools for processing text and string matching. Scala provides rich regex support, including pattern matching integration and convenient APIs.

Regex Basics

Creating Regular Expressions

scala
import scala.util.matching.Regex

/** Demonstrates how to construct a `Regex` and run the basic search calls on a string. */
object RegexBasics {
  def main(args: Array[String]): Unit = {
    // Three equivalent construction styles; these values exist only to show the syntax.
    val viaDotR: Regex = "\\d+".r
    val viaConstructor: Regex = new Regex("\\d+")
    val viaRawString: Regex = """(\d{4})-(\d{2})-(\d{2})""".r

    // Inline flags are written at the start of the pattern text.
    val ignoringCase: Regex = "(?i)hello".r
    val lineAnchored: Regex = "(?m)^start".r

    println("Regex creation successful")

    val sentence = "The year 2023 has 365 days"
    val digits = "\\d+".r

    // findFirstIn yields an Option holding the leftmost match, if any.
    val leading = digits.findFirstIn(sentence).getOrElse("not found")
    println(s"First number: $leading")

    // findAllIn yields an iterator over every non-overlapping match.
    val everyNumber = digits.findAllIn(sentence).toList
    println(s"All numbers: $everyNumber")

    // A defined first match means the text contains at least one number.
    val hasNumbers = digits.findFirstIn(sentence).nonEmpty
    println(s"Contains numbers: $hasNumbers")
  }
}

Basic Matching Operations

scala
/** Finds e-mail addresses, phone numbers and dates in a block of contact text. */
object BasicMatching {
  def main(args: Array[String]): Unit = {
    val emailRegex = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
    val phoneRegex = """\d{3}-\d{3}-\d{4}""".r
    val dateRegex = """\d{4}-\d{2}-\d{2}""".r

    val text = """
      Contact information:
      Email: john.doe@example.com, alice@company.org
      Phone: 555-123-4567, 555-987-6543
      Date: 2023-12-25, 2024-01-01
    """

    // One summary line per pattern, printed in declaration order.
    val searches = List(
      "Email addresses" -> emailRegex,
      "Phone numbers" -> phoneRegex,
      "Dates" -> dateRegex
    )
    for ((label, regex) <- searches) {
      val hits = regex.findAllIn(text).toList
      println(s"$label: ${hits.mkString(", ")}")
    }

    // findAllMatchIn exposes Match objects, which also carry the match offsets.
    println("\nDetailed match information:")
    for (hit <- emailRegex.findAllMatchIn(text))
      println(s"Email: '${hit.matched}' position: ${hit.start}-${hit.end}")
  }
}

Capture Groups

Basic Capture Groups

scala
/** Shows numbered capture groups, read through `Match` and through extractor patterns. */
object CaptureGroups {
  def main(args: Array[String]): Unit = {
    val namePattern = """(\w+)\s+(\w+)""".r         // first name, last name
    val emailPattern = """(\w+)@(\w+\.\w+)""".r      // username, domain
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r // year, month, day

    val text = "John Smith's email is john@example.com, registration date is 2023-12-25"

    // Group 0 is the whole match; groups 1..n are the parenthesised parts.
    namePattern.findFirstMatchIn("John Smith").fold(println("Name pattern not found")) { hit =>
      println(s"First name: ${hit.group(1)}")
      println(s"Last name: ${hit.group(2)}")
      println(s"Full match: ${hit.group(0)}")
    }

    // subgroups lists groups 1..n in order, so it can be zipped with the labels.
    emailPattern.findFirstMatchIn(text).fold(println("Email not found")) { hit =>
      List("Username", "Domain").zip(hit.subgroups).foreach {
        case (label, value) => println(s"$label: $value")
      }
    }

    datePattern.findFirstMatchIn(text).fold(println("Date pattern not found")) { hit =>
      List("Year", "Month", "Day").zip(hit.subgroups).foreach {
        case (label, value) => println(s"$label: $value")
      }
    }

    // A Regex used as an extractor binds its groups when the whole string matches.
    val dateString = "2023-12-25"
    val verdict = dateString match {
      case datePattern(year, month, day) =>
        s"Pattern match - Year: $year, Month: $month, Day: $day"
      case _ =>
        "Date format doesn't match"
    }
    println(verdict)
  }
}

Named Capture Groups

scala
/** Parses a log line twice: once with named groups, once with an extractor into a case class. */
object NamedCaptureGroups {
  def main(args: Array[String]): Unit = {
    // Structured form of one log line, filled from the positional groups below.
    case class LogEntry(timestamp: String, level: String, message: String)

    // Java-style named capture groups (Scala 2.13+)
    val logPattern = """(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(?<level>\w+)\] (?<message>.+)""".r
    // Same shape with plain positional groups, for use as an extractor.
    val simpleLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)""".r

    val logEntry = "2023-12-25 10:30:45 [INFO] Application started successfully"

    // Look each group up by name and pair it with its display label.
    val namedReport = logPattern
      .findFirstMatchIn(logEntry)
      .map { hit =>
        List("Timestamp" -> "timestamp", "Level" -> "level", "Message" -> "message").map {
          case (label, groupName) => s"$label: ${hit.group(groupName)}"
        }
      }
      .getOrElse(List("Log format doesn't match"))
    namedReport.foreach(println)

    // Yields a LogEntry only when the whole line matches simpleLogPattern.
    def parseLogEntry(log: String): Option[LogEntry] =
      Some(log).collect {
        case simpleLogPattern(timestamp, level, message) => LogEntry(timestamp, level, message)
      }

    val summary = parseLogEntry(logEntry).fold("Unable to parse log")(entry => s"Parsed log: $entry")
    println(summary)
  }
}

String Replacement

Basic Replacement Operations

scala
/** Walks through literal, group-based and function-based replacement with `Regex`. */
object StringReplacement {
  def main(args: Array[String]): Unit = {
    val text = "The quick brown fox jumps over the lazy dog. The fox is quick."
    val fox = "fox".r

    // replaceAllIn rewrites every occurrence; replaceFirstIn only the leftmost one.
    println(s"Replace fox -> cat: ${fox.replaceAllIn(text, "cat")}")
    println(s"Replace first fox -> wolf: ${fox.replaceFirstIn(text, "wolf")}")

    // $1, $2, ... in the replacement text refer to the numbered capture groups.
    val dashedPhone = """(\d{3})-(\d{3})-(\d{4})""".r
    val phoneText = "Phone numbers: 555-123-4567 and 555-987-6543"
    val prettyPhones = dashedPhone.replaceAllIn(phoneText, "($1) $2-$3")
    println(s"Formatted phones: $prettyPhones")

    // A Match => String function computes the replacement for each match.
    val digits = """\d+""".r
    val numberText = "I have 5 apples and 10 oranges"
    val twice = digits.replaceAllIn(numberText, hit => {
      val doubled = hit.matched.toInt * 2
      doubled.toString
    })
    println(s"Numbers doubled: $twice")

    // Upper-case only the words longer than four characters.
    val word = """\b\w+\b""".r
    val shouted = word.replaceAllIn(text, hit =>
      hit.matched match {
        case long if long.length > 4 => long.toUpperCase
        case short                   => short
      }
    )
    println(s"Long words uppercase: $shouted")
  }
}

Advanced Replacement Techniques

scala
/** Practical replacement recipes: tag stripping, URL linking, masking and reformatting. */
object AdvancedReplacement {
  def main(args: Array[String]): Unit = {
    // HTML tag cleanup: drop everything between '<' and the next '>'.
    val htmlText = "<p>This is an <strong>important</strong> <em>message</em>.</p>"
    val htmlTagPattern = """<[^>]+>""".r
    val cleanText = htmlTagPattern.replaceAllIn(htmlText, "")
    println(s"Cleaned HTML: $cleanText")

    // URL conversion to links.
    // Inside a Java character class an unescaped '[' opens a nested class, which
    // leaves the outer class unclosed and makes compilation fail; the literal
    // bracket is therefore written as '\['.
    val urlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
    val textWithUrls = "Visit https://www.example.com or http://blog.example.org/post"
    // The function's result is still parsed as a replacement template, so '$' or
    // '\' inside a matched URL must be quoted to be inserted literally.
    val linkedText = urlPattern.replaceAllIn(
      textWithUrls,
      m => scala.util.matching.Regex.quoteReplacement(s"<a href='${m.matched}'>${m.matched}</a>")
    )
    println(s"URL to links: $linkedText")

    // Sensitive information masking: keep the first and last four digits only.
    val creditCardPattern = """(\d{4})-(\d{4})-(\d{4})-(\d{4})""".r
    val sensitiveText = "Credit card number: 1234-5678-9012-3456"
    val maskedText = creditCardPattern.replaceAllIn(sensitiveText, "$1-****-****-$4")
    println(s"Credit card masked: $maskedText")

    // Date format conversion: YYYY-MM-DD -> DD/MM/YYYY by reordering the groups.
    val datePattern = """(\d{4})-(\d{2})-(\d{2})""".r
    val dateText = "Meeting date: 2023-12-25"
    val reformattedDate = datePattern.replaceAllIn(dateText, "$3/$2/$1")
    println(s"Date format conversion: $reformattedDate")

    // Multi-step replacement: strip punctuation, collapse whitespace runs,
    // then trim and lower-case.
    def cleanAndFormat(text: String): String = {
      val withoutPunctuation = """[^\w\s]""".r.replaceAllIn(text, "")
      val singleSpaced = """\s+""".r.replaceAllIn(withoutPunctuation, " ")
      singleSpaced.trim.toLowerCase
    }

    val messyText = "  Hello,   World!!!   How are you???  "
    println(s"Cleaned and formatted: '${cleanAndFormat(messyText)}'")
  }
}

Pattern Matching Integration

Regex with Pattern Matching

scala
/** Uses regexes as extractors to classify and validate short input strings. */
object RegexPatternMatching {
  // Each pattern must match the whole input when used as an extractor below.
  val EmailPattern = """(\w+)@(\w+\.\w+)""".r
  val PhonePattern = """(\d{3})-(\d{3})-(\d{4})""".r
  val DatePattern = """(\d{4})-(\d{2})-(\d{2})""".r
  val TimePattern = """(\d{2}):(\d{2}):(\d{2})""".r
  val IpPattern = """(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})""".r

  /**
   * Describes the first pattern that matches the trimmed input.
   *
   * This is a shape check only: a string such as "2023-13-45" is still reported
   * as a date because it has the YYYY-MM-DD layout.
   *
   * @param input text to classify; surrounding whitespace is ignored for matching
   * @return a description of the recognised format, or an "Unrecognized format" message
   */
  def classifyInput(input: String): String = input.trim match {
    case EmailPattern(user, domain) =>
      s"Email address: Username=$user, Domain=$domain"

    case PhonePattern(area, exchange, number) =>
      s"Phone number: Area=$area, Exchange=$exchange, Number=$number"

    case DatePattern(year, month, day) =>
      s"Date: $year year, $month month, $day day"

    case TimePattern(hour, minute, second) =>
      s"Time: $hour:$minute:$second"

    case IpPattern(a, b, c, d) =>
      s"IP address: $a.$b.$c.$d"

    case _ =>
      s"Unrecognized format: $input"
  }

  /**
   * Extracts the fields of an e-mail address, phone number or calendar date.
   *
   * @param input text to examine; it is matched as given, without trimming
   * @return the extracted fields plus a "type" entry, or None when the input
   *         matches no pattern or is a date that does not exist
   */
  def validateAndExtract(input: String): Option[Map[String, String]] = input match {
    case EmailPattern(user, domain) =>
      Some(Map("type" -> "email", "user" -> user, "domain" -> domain))

    case PhonePattern(area, exchange, number) =>
      Some(Map("type" -> "phone", "area" -> area, "exchange" -> exchange, "number" -> number))

    // The groups are fixed-width digit runs, so toInt cannot fail here.
    case DatePattern(year, month, day) if isValidDate(year.toInt, month.toInt, day.toInt) =>
      Some(Map("type" -> "date", "year" -> year, "month" -> month, "day" -> day))

    case _ => None
  }

  /**
   * Tells whether year/month/day name a real calendar date.
   *
   * Month lengths and leap years are taken into account, so 31 April and
   * 29 February of a non-leap year are rejected.
   */
  def isValidDate(year: Int, month: Int, day: Int): Boolean =
    // LocalDate.of throws for any out-of-range or non-existent combination.
    scala.util.Try(java.time.LocalDate.of(year, month, day)).isSuccess

  def main(args: Array[String]): Unit = {
    val inputs = List(
      "john@example.com",
      "555-123-4567",
      "2023-12-25",
      "14:30:45",
      "192.168.1.1",
      "invalid-input",
      "2023-13-45"  // Invalid date
    )

    println("Input classification:")
    inputs.foreach(input => println(s"$input -> ${classifyInput(input)}"))

    println("\nValidation and extraction:")
    inputs.foreach { input =>
      validateAndExtract(input) match {
        case Some(data) => println(s"$input -> valid: $data")
        case None => println(s"$input -> invalid")
      }
    }
  }
}

Practical Application Examples

Log Analyzer

scala
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

/** Parses Apache access-log, application-log and error-log lines and summarises the access logs. */
object LogAnalyzer {
  // Different log pattern types.
  // The request part is quoted in the log line; the closing quote is matched
  // explicitly so that it does not end up inside the protocol group.
  val ApacheLogPattern = """(\S+) \S+ \S+ \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  val ApplicationLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (\w+): (.+)""".r
  val ErrorLogPattern = """(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ERROR (.+) - (.+)""".r

  /** Common parent of every record `parseLogLine` can produce. */
  sealed trait LogRecord

  /** One Apache access-log line; `size` is the trailing numeric field of the line. */
  case class ApacheLogEntry(
    ip: String,
    timestamp: String,
    method: String,
    url: String,
    protocol: String,
    status: Int,
    size: Int
  ) extends LogRecord

  /** One "timestamp [LEVEL] logger: message" line. */
  case class ApplicationLogEntry(
    timestamp: String,
    level: String,
    logger: String,
    message: String
  ) extends LogRecord

  /** One "timestamp ERROR location - message" line. */
  case class ErrorLogEntry(
    timestamp: String,
    location: String,
    message: String
  ) extends LogRecord

  /**
   * Parses a single log line into the matching record type.
   *
   * @param line the raw log line; it must match one pattern in full
   * @return the parsed record, or None when no pattern matches or when an
   *         access-log size does not fit in an Int
   */
  def parseLogLine(line: String): Option[LogRecord] = line match {
    case ApacheLogPattern(ip, timestamp, method, url, protocol, status, size) =>
      // status is exactly three digits, so toInt is safe; size is an unbounded
      // digit run and is therefore converted defensively.
      size.toIntOption.map(ApacheLogEntry(ip, timestamp, method, url, protocol, status.toInt, _))

    case ApplicationLogPattern(timestamp, level, logger, message) =>
      Some(ApplicationLogEntry(timestamp, level, logger, message))

    case ErrorLogPattern(timestamp, location, message) =>
      Some(ErrorLogEntry(timestamp, location, message))

    case _ => None
  }

  /** Counts how many entries share each value of `key`. */
  private def countBy[K](logs: List[ApacheLogEntry])(key: ApacheLogEntry => K): Map[K, Int] =
    logs.groupMapReduce(key)(_ => 1)(_ + _)

  /** The five most frequent keys, highest count first; ties are ordered by key. */
  private def topFive(counts: Map[String, Int]): Seq[(String, Int)] =
    counts.toSeq.sortBy { case (key, count) => (-count, key) }.take(5)

  /** Prints status-code, URL and client-IP frequency statistics for the given access logs. */
  def analyzeApacheLogs(logs: List[ApacheLogEntry]): Unit = {
    println("Apache log analysis:")

    // Status code statistics
    val statusCounts = countBy(logs)(_.status)
    println(s"Status code distribution: $statusCounts")

    // Most accessed URLs
    println("Most accessed URLs:")
    topFive(countBy(logs)(_.url)).foreach { case (url, count) => println(s"  $url: $count times") }

    // IP address statistics
    println("Most active IPs:")
    topFive(countBy(logs)(_.ip)).foreach { case (ip, count) => println(s"  $ip: $count times") }
  }

  def main(args: Array[String]): Unit = {
    val sampleLogs = List(
      """192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.html HTTP/1.1" 200 1234""",
      """192.168.1.2 - - [25/Dec/2023:10:01:00 +0000] "POST /api/users HTTP/1.1" 201 567""",
      """2023-12-25 10:00:00 [INFO] UserService: User login successful""",
      """2023-12-25 10:01:00 ERROR DatabaseConnection - Connection timeout""",
      """192.168.1.1 - - [25/Dec/2023:10:02:00 +0000] "GET /about.html HTTP/1.1" 200 890"""
    )

    // Lines that match no pattern are dropped.
    val parsedLogs = sampleLogs.flatMap(parseLogLine)

    println("Parsed log entries:")
    parsedLogs.foreach(println)

    // Analyze Apache logs
    val apacheLogs = parsedLogs.collect { case log: ApacheLogEntry => log }
    if (apacheLogs.nonEmpty) {
      analyzeApacheLogs(apacheLogs)
    }
  }
}

Text Processor

scala
/** Regex-based helpers for extracting, highlighting and reformatting pieces of free text. */
object TextProcessor {
  // Various text patterns.
  // Inside a Java character class an unescaped '[' opens a nested class and
  // leaves the outer one unclosed, so the literal bracket is written as '\['.
  val UrlPattern = """https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?""".r
  val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val HashtagPattern = """#\w+""".r
  val MentionPattern = """@\w+""".r
  val PhonePattern = """\b\d{3}-\d{3}-\d{4}\b""".r

  // Compiled once here instead of on every call.
  private val WordPattern = """\b\w+\b""".r
  private val QuotePattern = """"([^"]+)"""".r
  // (?s) lets '.' cross line breaks so that multi-line code blocks are matched.
  private val CodeBlockPattern = """(?s)```(\w+)?\n(.*?)\n```""".r

  // Applied in order. Bold must run before italic, otherwise the italic rule
  // would consume the inner '*...*' of a '**...**' span.
  private val MarkdownRules = List(
    """\*\*([^*]+)\*\*""".r -> "<strong>$1</strong>",
    """\*([^*]+)\*""".r -> "<em>$1</em>",
    """\[([^\]]+)\]\(([^)]+)\)""".r -> "<a href='$2'>$1</a>",
    """`([^`]+)`""".r -> "<code>$1</code>"
  )

  /** Everything `analyzeText` extracts from one piece of text. */
  case class TextAnalysis(
    urls: List[String],
    emails: List[String],
    hashtags: List[String],
    mentions: List[String],
    phones: List[String],
    wordCount: Int,
    characterCount: Int
  )

  /** Collects every match of each pattern in `text`, plus word and character counts. */
  def analyzeText(text: String): TextAnalysis = {
    TextAnalysis(
      urls = UrlPattern.findAllIn(text).toList,
      emails = EmailPattern.findAllIn(text).toList,
      hashtags = HashtagPattern.findAllIn(text).toList,
      mentions = MentionPattern.findAllIn(text).toList,
      phones = PhonePattern.findAllIn(text).toList,
      wordCount = WordPattern.findAllIn(text).size,
      characterCount = text.length
    )
  }

  /** Returns the contents of each non-empty double-quoted span, without the quotes. */
  def extractQuotes(text: String): List[String] =
    QuotePattern.findAllMatchIn(text).map(_.group(1)).toList

  /**
   * Wraps each whole-word, case-insensitive occurrence of a keyword in `**`.
   *
   * Keywords are matched literally: regex metacharacters in a keyword have no
   * special meaning. Empty keywords are ignored.
   *
   * @param text     the text to mark up
   * @param keywords the literal words to highlight
   */
  def highlightKeywords(text: String, keywords: List[String]): String = {
    keywords.filter(_.nonEmpty).foldLeft(text) { (result, keyword) =>
      val pattern = s"(?i)\\b${scala.util.matching.Regex.quote(keyword)}\\b".r
      // The matched text is inserted literally even if it contains '$' or '\'.
      pattern.replaceAllIn(result, m => scala.util.matching.Regex.quoteReplacement(s"**${m.matched}**"))
    }
  }

  /**
   * Replaces each whole-word, case-insensitive occurrence of a listed word with
   * as many `*` characters as the listed word has.
   *
   * Words are matched literally; empty words are ignored.
   */
  def censorProfanity(text: String, profanityList: List[String]): String = {
    profanityList.filter(_.nonEmpty).foldLeft(text) { (result, word) =>
      val pattern = s"(?i)\\b${scala.util.matching.Regex.quote(word)}\\b".r
      pattern.replaceAllIn(result, "*" * word.length)
    }
  }

  /**
   * Returns the body of every fenced code block; the optional language tag after
   * the opening fence is not included. Bodies may span several lines.
   */
  def extractCodeBlocks(text: String): List[String] =
    CodeBlockPattern.findAllMatchIn(text).map(_.group(2)).toList

  /** Converts bold, italic, link and inline-code Markdown spans to HTML tags. */
  def formatMarkdown(text: String): String =
    MarkdownRules.foldLeft(text) { case (result, (pattern, replacement)) =>
      pattern.replaceAllIn(result, replacement)
    }

  def main(args: Array[String]): Unit = {
    // The quoted line gives extractQuotes something to find.
    val sampleText = """
      Check my website https://www.example.com or send email to contact@example.com
      Follow me @username and use tag #scala #programming
      Phone: 555-123-4567
      "This is a quoted text"
      **bold text** and *italic text*
      [link text](https://link.com)
      `code snippet`
      """

    // Text analysis
    val analysis = analyzeText(sampleText)
    println("Text analysis results:")
    println(s"URLs: ${analysis.urls}")
    println(s"Emails: ${analysis.emails}")
    println(s"Hashtags: ${analysis.hashtags}")
    println(s"Mentions: ${analysis.mentions}")
    println(s"Phones: ${analysis.phones}")
    println(s"Word count: ${analysis.wordCount}")
    println(s"Character count: ${analysis.characterCount}")

    // Extract quotes
    val quotes = extractQuotes(sampleText)
    println(s"\nQuotes content: $quotes")

    // Keyword highlighting
    val highlighted = highlightKeywords(sampleText, List("scala", "programming"))
    println(s"\nKeyword highlights:\n$highlighted")

    // Markdown formatting
    val formatted = formatMarkdown(sampleText)
    println(s"\nMarkdown formatted:\n$formatted")
  }
}

Data Validator

scala
/** Validates form-style string fields against named regex rules, plus custom age and date checks. */
object DataValidator {
  // Validation patterns
  val EmailPattern = """^[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}$""".r
  val PhonePattern = """^\d{3}-\d{3}-\d{4}$""".r
  val ZipCodePattern = """^\d{5}(-\d{4})?$""".r
  val CreditCardPattern = """^\d{4}-\d{4}-\d{4}-\d{4}$""".r
  val PasswordPattern = """^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""".r
  val UsernamePattern = """^[a-zA-Z0-9_]{3,20}$""".r
  // Inside a Java character class an unescaped '[' opens a nested class and
  // leaves the outer one unclosed, so the literal bracket is written as '\['.
  val UrlPattern = """^https?://[\w.-]+(?:/[\w._~:/?#\[\]@!$&'()*+,;=-]*)?$""".r

  // Compiled once here instead of on every call of the custom validators.
  private val AgePattern = """^\d+$""".r
  private val DatePattern = """^(\d{4})-(\d{2})-(\d{2})$""".r

  /** Outcome of a validation: either `Valid` or `Invalid` with an explanation. */
  sealed trait ValidationResult
  case object Valid extends ValidationResult
  case class Invalid(message: String) extends ValidationResult

  /** A named rule: the value must match `pattern`, otherwise `errorMessage` is reported. */
  case class ValidationRule(name: String, pattern: Regex, errorMessage: String)

  // Rules keyed by the field type accepted by `validate`.
  val validationRules = Map(
    "email" -> ValidationRule("Email", EmailPattern, "Email format invalid"),
    "phone" -> ValidationRule("Phone", PhonePattern, "Phone format should be XXX-XXX-XXXX"),
    "zipcode" -> ValidationRule("Zip code", ZipCodePattern, "Zip code format invalid"),
    "creditcard" -> ValidationRule("Credit card", CreditCardPattern, "Credit card format should be XXXX-XXXX-XXXX-XXXX"),
    "password" -> ValidationRule("Password", PasswordPattern, "Password must contain uppercase letters, lowercase, numbers and special characters, at least 8 characters"),
    "username" -> ValidationRule("Username", UsernamePattern, "Username can only contain letters, numbers and underscores, 3-20 characters"),
    "url" -> ValidationRule("URL", UrlPattern, "URL format invalid")
  )

  /**
   * Validates `value` against the rule registered for `fieldType`.
   *
   * @param fieldType a key of `validationRules`
   * @param value     the text to check; it must match the rule's pattern in full
   * @return `Valid`, or `Invalid` carrying the rule's message (or an
   *         "Unknown field type" message when no rule is registered)
   */
  def validate(fieldType: String, value: String): ValidationResult = {
    validationRules.get(fieldType) match {
      case Some(rule) =>
        if (rule.pattern.matches(value)) Valid
        else Invalid(rule.errorMessage)
      case None =>
        Invalid(s"Unknown field type: $fieldType")
    }
  }

  /** Validates every entry of `data`, using each key as the field type. */
  def validateMultiple(data: Map[String, String]): Map[String, ValidationResult] = {
    data.map { case (fieldType, value) =>
      fieldType -> validate(fieldType, value)
    }
  }

  // Custom validators

  /**
   * Accepts a string of digits whose value lies between 0 and 150.
   *
   * A digit string too large for an Int is reported as out of range instead of
   * raising an exception.
   */
  def validateAge(ageStr: String): ValidationResult = {
    ageStr match {
      case AgePattern() =>
        ageStr.toIntOption match {
          case Some(age) if age >= 0 && age <= 150 => Valid
          case _ => Invalid("Age must be between 0 and 150")
        }
      case _ =>
        Invalid("Age must be a number")
    }
  }

  /**
   * Accepts a YYYY-MM-DD string that names a real calendar date with a year
   * between 1900 and 2100; month lengths and leap years are honoured.
   */
  def validateDate(dateStr: String): ValidationResult = {
    dateStr match {
      case DatePattern(year, month, day) =>
        // The groups are fixed-width digit runs, so toInt cannot fail here.
        val y = year.toInt
        // LocalDate.of throws for any non-existent month/day combination.
        val existsInCalendar =
          scala.util.Try(java.time.LocalDate.of(y, month.toInt, day.toInt)).isSuccess

        if (existsInCalendar && y >= 1900 && y <= 2100) {
          Valid
        } else {
          Invalid("Date values outside valid range")
        }
      case _ =>
        Invalid("Date format should be YYYY-MM-DD")
    }
  }

  def main(args: Array[String]): Unit = {
    val testData = Map(
      "email" -> "user@example.com",
      "phone" -> "555-123-4567",
      "zipcode" -> "12345",
      "creditcard" -> "1234-5678-9012-3456",
      "password" -> "MyPass123!",
      "username" -> "user_123",
      "url" -> "https://www.example.com"
    )

    // Every value here must fail its rule; the username contains a space and '!'.
    val invalidData = Map(
      "email" -> "invalid-email",
      "phone" -> "123456789",
      "zipcode" -> "abc",
      "password" -> "weak",
      "username" -> "bad name!"
    )

    println("Valid data validation:")
    validateMultiple(testData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\nInvalid data validation:")
    validateMultiple(invalidData).foreach { case (field, result) =>
      println(s"$field: $result")
    }

    println("\nCustom validation:")
    println(s"Age '25': ${validateAge("25")}")
    println(s"Age '200': ${validateAge("200")}")
    println(s"Date '2023-12-25': ${validateDate("2023-12-25")}")
    println(s"Date '2023-13-45': ${validateDate("2023-13-45")}")
  }
}

Performance Optimization

Compiling and Caching

scala
/** Contrasts per-call regex compilation with pre-compiled patterns and times both. */
object RegexPerformance {
  // Compiled once, when the object is initialised.
  val EmailPattern = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
  val PhonePattern = """\d{3}-\d{3}-\d{4}""".r

  /** Keeps the texts containing an e-mail address, deliberately recompiling the regex per element. */
  def inefficientMatching(texts: List[String]): List[String] =
    texts.filter { candidate =>
      // Built afresh on every iteration: this is the anti-pattern being shown.
      val freshlyCompiled = """[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}""".r
      freshlyCompiled.findFirstIn(candidate).nonEmpty
    }

  /** Keeps the texts containing an e-mail address, reusing the pre-compiled pattern. */
  def efficientMatching(texts: List[String]): List[String] =
    texts.filter(candidate => EmailPattern.findFirstIn(candidate).nonEmpty)

  /**
   * Splits the texts into those containing an e-mail address, those containing
   * a phone number (but no e-mail address), and the rest.
   *
   * @return a map with the keys "emails", "phones" and "others"; input order is
   *         preserved inside each list
   */
  def batchProcess(texts: List[String]): Map[String, List[String]] = {
    // E-mail takes precedence: only texts without one are tested for a phone number.
    val (withEmail, withoutEmail) = texts.partition(t => EmailPattern.findFirstIn(t).nonEmpty)
    val (withPhone, neither) = withoutEmail.partition(t => PhonePattern.findFirstIn(t).nonEmpty)

    Map(
      "emails" -> withEmail,
      "phones" -> withPhone,
      "others" -> neither
    )
  }

  /** Runs the three operations on generated input and prints the elapsed whole milliseconds. */
  def benchmarkRegex(): Unit = {
    val testTexts =
      List("user@example.com", "555-123-4567", "random text").flatMap(sample => List.fill(10000)(sample))

    // Evaluates the by-name block once, prints its duration and passes the result through.
    def timeOperation[T](name: String)(operation: => T): T = {
      val started = System.nanoTime()
      val outcome = operation
      val finished = System.nanoTime()
      println(f"$name%20s: ${(finished - started) / 1000000}%6d ms")
      outcome
    }

    println("Regex performance tests:")

    val firstThousand = testTexts.take(1000)
    timeOperation("Inefficient matching")(inefficientMatching(firstThousand))
    timeOperation("Efficient matching")(efficientMatching(firstThousand))
    timeOperation("Batch processing")(batchProcess(testTexts))
  }

  def main(args: Array[String]): Unit =
    benchmarkRegex()
}

Best Practices

  1. Pre-compile regular expressions:

    • Avoid recompiling in loops
    • Use .r method or new Regex()
    • Define commonly used patterns as constants
  2. Use raw strings:

    • Use triple quotes to avoid escaping
    • Improve readability and maintainability
    • Reduce complexity of escape characters
  3. Use capture groups appropriately:

    • Only use capture groups when needed
    • Use non-capturing groups (?:...) for better performance
    • Consider named capture groups for readability
  4. Pattern matching integration:

    • Leverage Scala's pattern matching features
    • Use unapply and unapplySeq
    • Create custom extractors
  5. Performance considerations:

    • Avoid overly complex regular expressions
    • Consider using multiple simple patterns instead of one complex pattern
    • Benchmark with realistic input when processing large volumes of data

Regular expressions are powerful tools for text processing, and Scala's integration of regexes with pattern matching makes them especially concise and expressive to use.

Content is for learning and research only.