Regular Expressions
Overview
Regular expressions are a powerful tool for text processing: pattern matching, searching, replacement, and validation. On the JVM, Kotlin's regular expression support is built on Java's Pattern and Matcher classes, and Kotlin wraps them in a more concise, Kotlin-style API centered on the Regex class.
Regular Expression Basics
Creating Regular Expressions
kotlin
/**
 * Shows the ways a [Regex] can be built (constructor, `toRegex()`, raw strings,
 * options) and then runs three containment checks against a sample string.
 *
 * Several of the regexes are created only to illustrate the syntax and are
 * never used afterwards.
 */
fun main() {
    println("=== Creating Regular Expressions ===")

    // 1. Regex constructor with ordinary string literals (backslashes are doubled)
    val plainHello = Regex("hello")
    val digitRun = Regex("[0-9]+")
    val escapedPhone = Regex("\\d{3}-\\d{3}-\\d{4}") // Phone number format

    // 2. String.toRegex() extension function
    val plainWorld = "world".toRegex()
    val letterRun = "[a-zA-Z]+".toRegex()

    // 3. Raw strings: each backslash is written once
    val isoDate = Regex("""^\d{4}-\d{2}-\d{2}$""") // Date format
    val emailShape = Regex("""[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""") // Email

    // 4. Regular expressions with options
    val helloAnyCase = Regex("HELLO", RegexOption.IGNORE_CASE)
    // COMMENTS permits whitespace and '#' comments inside the pattern text
    val annotatedPhone = Regex(
        """
        \d{3} # Area code
        - # Separator
        \d{3} # First three digits
        - # Separator
        \d{4} # Last four digits
        """.trimIndent(),
        setOf(RegexOption.COMMENTS, RegexOption.IGNORE_CASE)
    )
    println("Regular expressions created successfully")

    // Test basic matching: one labelled containment check per regex
    val sample = "Hello World 123"
    println("Test string: '$sample'")
    val containmentChecks = listOf(
        "Contains 'hello' (case-insensitive)" to helloAnyCase,
        "Contains digits" to digitRun,
        "Contains letters" to letterRun
    )
    for ((label, regex) in containmentChecks) {
        println("$label: ${regex.containsMatchIn(sample)}")
    }
}
Basic Matching Operations
kotlin
/**
 * Walks through the basic matching calls on [Regex]: `containsMatchIn`,
 * `find`, `findAll`, `matches`, and mapping match results to other types.
 */
fun main() {
    println("=== Basic Matching Operations ===")
    val text = "Contact phone: 138-1234-5678, email: user@example.com, date: 2023-12-25"

    // 1. Check if contains match
    val phonePattern = Regex("""\d{3}-\d{4}-\d{4}""")
    val emailPattern = Regex("""[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}""")
    val datePattern = Regex("""\d{4}-\d{2}-\d{2}""")
    println("Text: $text")
    println("Contains phone number: ${phonePattern.containsMatchIn(text)}")
    println("Contains email: ${emailPattern.containsMatchIn(text)}")
    println("Contains date: ${datePattern.containsMatchIn(text)}")

    // 2. Find first match (find returns null when nothing matches)
    val firstPhone = phonePattern.find(text)
    if (firstPhone != null) {
        println("Found phone number: ${firstPhone.value}")
        println("Position: ${firstPhone.range}")
    }
    val firstEmail = emailPattern.find(text)
    if (firstEmail != null) {
        val span = firstEmail.range
        println("Found email: ${firstEmail.value}")
        println("Start position: ${span.first}")
        println("End position: ${span.last}")
    }

    // 3. Find all matches
    val digitRuns = Regex("""\d+""")
    println("All numbers:")
    for (hit in digitRuns.findAll(text)) {
        println(" ${hit.value} at ${hit.range}")
    }

    // 4. Full match: matches() succeeds only when the entire input fits the pattern
    val phoneNumber = "138-1234-5678"
    val wholePhonePattern = Regex("""^\d{3}-\d{4}-\d{4}$""")
    println("'$phoneNumber' is complete phone number: ${wholePhonePattern.matches(phoneNumber)}")
    println("'138-1234' is complete phone number: ${wholePhonePattern.matches("138-1234")}")

    // 5. Convert match results
    val numbers = mutableListOf<Int>()
    for (hit in digitRuns.findAll(text)) {
        numbers += hit.value.toInt()
    }
    println("Extracted numbers: $numbers")
}
Regular Expression Patterns
Character Classes and Quantifiers
kotlin
/**
 * Runs a table of character-class regexes over a list of sample strings and
 * prints every non-empty result, then prints whole-string match results for a
 * table of quantifier patterns.
 */
fun main() {
    println("=== Character Classes and Quantifiers ===")
    val samples = listOf(
        "abc123",
        "ABC",
        "123",
        "hello@world.com",
        "test_file.txt",
        "special-chars!@#",
        " spaces ",
        "newline\nhere",
        "tab\there"
    )

    // Character class patterns, keyed by the label printed next to their matches
    val classPatterns = mapOf(
        "Digits" to Regex("""\d+"""),
        "Letters" to Regex("""[a-zA-Z]+"""),
        "Alphanumeric" to Regex("""\w+"""),
        "Lowercase" to Regex("""[a-z]+"""),
        "Uppercase" to Regex("""[A-Z]+"""),
        "Special chars" to Regex("""[!@#$%^&*()]+"""),
        "Whitespace" to Regex("""\s+"""),
        "Non-digits" to Regex("""\D+"""),
        "Non-word" to Regex("""\W+"""),
        "Email format" to Regex("""\w+@\w+\.\w+""")
    )
    for (sample in samples) {
        println("Test string: '$sample'")
        for ((label, regex) in classPatterns) {
            val found = regex.findAll(sample).map { it.value }.toList()
            // Patterns that found nothing are not reported
            if (found.isEmpty()) continue
            println(" $label: $found")
        }
        println()
    }

    // Quantifier examples
    println("=== Quantifier Examples ===")
    val quantifierCases = mapOf(
        "a?" to listOf("", "a", "aa", "aaa"),
        "a*" to listOf("", "a", "aa", "aaa"),
        "a+" to listOf("", "a", "aa", "aaa"),
        "a{2}" to listOf("a", "aa", "aaa"),
        "a{2,}" to listOf("a", "aa", "aaa", "aaaa"),
        "a{1,3}" to listOf("", "a", "aa", "aaa", "aaaa")
    )
    for ((quantified, inputs) in quantifierCases) {
        println("Pattern: $quantified")
        // Each pattern is wrapped in ^...$ before compiling
        val anchored = Regex("^${quantified}$")
        for (input in inputs) {
            println(" '$input': ${anchored.matches(input)}")
        }
        println()
    }
}
Groups and Capturing
kotlin
/**
 * Demonstrates capturing groups: numbered groups, named groups, and using
 * groups to pull apart phone numbers, email addresses and a URL.
 *
 * In `groupValues`, index 0 is the whole match and indexes 1.. are the
 * capturing groups; an optional group that did not take part in the match
 * is reported as an empty string.
 */
fun main() {
    println("=== Groups and Capturing ===")

    // 1. Basic grouping
    val dateRegex = Regex("""(\d{4})-(\d{1,2})-(\d{1,2})""")
    val dateText = "Dates: 2023-12-25 and 2023-12-26"
    println("Text: $dateText")
    println("Finding all dates:")
    dateRegex.findAll(dateText).forEach { match ->
        println("Full match: ${match.value}")
        println("Year: ${match.groupValues[1]}")
        println("Month: ${match.groupValues[2]}")
        println("Day: ${match.groupValues[3]}")
        println("All groups: ${match.groupValues}")
        println()
    }

    // 2. Named groups
    val namedDateRegex = Regex("""(?<year>\d{4})-(?<month>\d{1,2})-(?<day>\d{1,2})""")
    println("Using named groups:")
    namedDateRegex.find(dateText)?.let { match ->
        println("Year: ${match.groups["year"]?.value}")
        println("Month: ${match.groups["month"]?.value}")
        println("Day: ${match.groupValues[3]}")
    }

    // 3. Phone number parsing
    val phoneText = "Contact: +1-555-123-4567 or 800-555-9876"
    val phoneRegex = Regex("""(?:(\+\d{1,3})-)?(\d{3})-(\d{3,8})-?(\d{4})?""")
    println("Phone number parsing:")
    phoneRegex.findAll(phoneText).forEach { match ->
        val (countryCode, areaCode, numberHead, numberTail) = match.destructured
        // The digits after the area code are split across groups 3 and 4
        // (group 4 is optional); join whichever parts matched so the trailing
        // four digits are not lost.
        val number = listOf(numberHead, numberTail)
            .filter { it.isNotEmpty() }
            .joinToString("-")
        println("Full number: ${match.value}")
        println("Country code: ${countryCode.ifEmpty { "none" }}")
        println("Area code: $areaCode")
        println("Number: $number")
        println()
    }

    // 4. Email address parsing
    val emailText = "Contacts: john.doe@company.com and admin@test.org"
    val emailRegex = Regex("""([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})""")
    println("Email address parsing:")
    emailRegex.findAll(emailText).forEach { match ->
        println("Full email: ${match.value}")
        println("Username: ${match.groupValues[1]}")
        println("Domain: ${match.groupValues[2]}")
        println("TLD: ${match.groupValues[3]}")
        println()
    }

    // 5. URL parsing: port, query and anchor are optional groups
    val urlText = "Visit https://www.example.com:8080/path/to/page?param=value#section"
    val urlRegex = Regex("""(https?)://([^:/]+)(?::(\d+))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?""")
    println("URL parsing:")
    urlRegex.find(urlText)?.let { match ->
        println("Full URL: ${match.value}")
        println("Protocol: ${match.groupValues[1]}")
        println("Host: ${match.groupValues[2]}")
        println("Port: ${match.groupValues[3].takeIf { it.isNotEmpty() } ?: "default"}")
        println("Path: ${match.groupValues[4]}")
        println("Query params: ${match.groupValues[5]}")
        println("Anchor: ${match.groupValues[6]}")
    }
}
Text Replacement and Processing
Basic Replacement Operations
kotlin
/**
 * Demonstrates regex-based replacement: fixed replacement text, group
 * references in the replacement string, replacement lambdas, and whitespace
 * normalisation.
 */
fun main() {
    println("=== Text Replacement Operations ===")
    val originalText = """
        User Information:
        Name: John Doe
        Phone: 555-123-4567
        Email: johndoe@example.com
        Birthday: 1990-05-15
        Address: 123 Main Street, New York
    """.trimIndent()
    println("Original text:")
    println(originalText)
    println()

    // 1. Simple replacement: every phone-shaped run becomes a fixed mask
    val phoneShape = Regex("""\d{3}-\d{3}-\d{4}""")
    println("Hidden phone number:")
    println(phoneShape.replace(originalText, "***-***-****"))
    println()

    // 2. Replacement using groups: $1..$3 in the replacement refer to the captured groups
    val isoDate = Regex("""(\d{4})-(\d{2})-(\d{2})""")
    val reorderedDate = "$2/$3/$1" // MM/DD/YYYY format
    println("Formatted date:")
    println(isoDate.replace(originalText, reorderedDate))
    println()

    // 3. Using replacement function: keep the first two username characters, star the rest
    val emailParts = Regex("""([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})""")
    val maskedEmailText = emailParts.replace(originalText) { match ->
        val (username, domain) = match.destructured
        val hiddenCount = (username.length - 2).coerceAtLeast(0)
        username.take(2) + "*".repeat(hiddenCount) + "@" + domain
    }
    println("Masked email:")
    println(maskedEmailText)
    println()

    // 4. Complex replacement: format phone numbers
    val phoneFormatText = "Phone numbers: 5551234567, 8001234567"
    val tenDigits = Regex("""(\d{3})(\d{3})(\d{4})""")
    val formattedPhones = tenDigits.replace(phoneFormatText) { match ->
        val (area, exchange, line) = match.destructured
        "$area-$exchange-$line"
    }
    println("Formatted phone numbers:")
    println("Original: $phoneFormatText")
    println("Formatted: $formattedPhones")
    println()

    // 5. Clean and normalize text
    val messyText = " Hello World! \n\n How are you? \t\t "
    // Collapse each whitespace run to a single space, then strip both ends
    val collapsed = Regex("""\s+""").replace(messyText, " ")
    val cleanedText = collapsed.trim()
    println("Text cleaning:")
    println("Original: '$messyText'")
    println("Cleaned: '$cleanedText'")
}
Data Validation
Common Validation Patterns
kotlin
/**
 * Regex-based validators for common input formats.
 *
 * Every `validate*` function returns [ValidationResult.Success] when the input
 * is accepted, or a [ValidationResult.Error] carrying a human-readable reason.
 * All patterns are compiled once in the companion object and reused.
 */
class DataValidator {
    /** Outcome of a validation: [Success], or [Error] with the reason for rejection. */
    sealed class ValidationResult {
        object Success : ValidationResult()
        data class Error(val message: String) : ValidationResult()
    }

    /** Accepts [email] only when the whole string fits [EMAIL_PATTERN]. */
    fun validateEmail(email: String): ValidationResult =
        matchOrError(EMAIL_PATTERN, email, "Invalid email format")

    /** Accepts [phone] only when it is written exactly as XXX-XXX-XXXX. */
    fun validatePhone(phone: String): ValidationResult =
        matchOrError(PHONE_PATTERN, phone, "Invalid phone format (use XXX-XXX-XXXX)")

    /**
     * Checks the password rules one at a time, in a fixed order, and reports
     * the first rule that fails.
     *
     * Unlike [PASSWORD_PATTERN], this does not reject characters outside the
     * letter/digit/special set; it only requires one character of each kind.
     */
    fun validatePassword(password: String): ValidationResult = when {
        password.length < 8 ->
            ValidationResult.Error("Password must be at least 8 characters")
        !LOWERCASE_LETTER.containsMatchIn(password) ->
            ValidationResult.Error("Password must contain lowercase letter")
        !UPPERCASE_LETTER.containsMatchIn(password) ->
            ValidationResult.Error("Password must contain uppercase letter")
        !DIGIT.containsMatchIn(password) ->
            ValidationResult.Error("Password must contain digit")
        !SPECIAL_CHARACTER.containsMatchIn(password) ->
            ValidationResult.Error("Password must contain special character")
        else -> ValidationResult.Success
    }

    /** Accepts [url] only when the whole string fits [URL_PATTERN] (http or https). */
    fun validateUrl(url: String): ValidationResult =
        matchOrError(URL_PATTERN, url, "Invalid URL format")

    /** Accepts [ip] only when it is four dot-separated numbers, each in 0..255. */
    fun validateIPv4(ip: String): ValidationResult =
        matchOrError(IPV4_PATTERN, ip, "Invalid IPv4 address format")

    /**
     * Validates a YYYY-MM-DD date: first the textual shape, then the year range
     * (1900-2100), month range, and the day against the month length, with
     * leap years taken into account for February.
     */
    fun validateDate(date: String): ValidationResult {
        if (!DATE_PATTERN.matches(date)) {
            return ValidationResult.Error("Invalid date format, should be YYYY-MM-DD")
        }
        // Safe to convert: DATE_PATTERN has already guaranteed three all-digit fields
        val (year, month, day) = date.split("-").map { it.toInt() }
        return when {
            year !in 1900..2100 -> ValidationResult.Error("Year should be between 1900-2100")
            month !in 1..12 -> ValidationResult.Error("Month should be between 1-12")
            day !in 1..31 -> ValidationResult.Error("Day should be between 1-31")
            month == 2 && day > 29 -> ValidationResult.Error("February cannot have more than 29 days")
            month == 2 && day == 29 && !isLeapYear(year) ->
                ValidationResult.Error("Non-leap year February cannot have 29 days")
            month in THIRTY_DAY_MONTHS && day > 30 ->
                ValidationResult.Error("This month only has 30 days")
            else -> ValidationResult.Success
        }
    }

    /** Gregorian rule: divisible by 4 and not by 100, or divisible by 400. */
    private fun isLeapYear(year: Int): Boolean =
        (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)

    /** Shared body of the single-pattern validators. */
    private fun matchOrError(pattern: Regex, input: String, errorMessage: String): ValidationResult =
        if (pattern.matches(input)) ValidationResult.Success else ValidationResult.Error(errorMessage)

    companion object {
        // Common regex patterns
        val EMAIL_PATTERN = Regex("""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$""")
        val PHONE_PATTERN = Regex("""^\d{3}-\d{3}-\d{4}$""") // US phone format
        val PASSWORD_PATTERN = Regex("""^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$""")
        val URL_PATTERN = Regex("""^https?://[^\s/$.?#].[^\s]*$""")
        val IPV4_PATTERN = Regex("""^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$""")
        val DATE_PATTERN = Regex("""^\d{4}-\d{2}-\d{2}$""")
        val TIME_PATTERN = Regex("""^([01]?[0-9]|2[0-3]):[0-5][0-9]$""")

        // Single-rule patterns for validatePassword, compiled once instead of on every call
        private val LOWERCASE_LETTER = Regex("[a-z]")
        private val UPPERCASE_LETTER = Regex("[A-Z]")
        private val DIGIT = Regex("\\d")
        private val SPECIAL_CHARACTER = Regex("[@$!%*?&]")

        // April, June, September, November
        private val THIRTY_DAY_MONTHS = setOf(4, 6, 9, 11)
    }
}
/**
 * Runs every sample value through the matching [DataValidator] function and
 * prints one verdict line per value, grouped by input type.
 */
fun main() {
    println("=== Data Validation Examples ===")
    val validator = DataValidator()

    // Picks the validator by the type label used as key in the sample table below
    fun runValidator(type: String, value: String): DataValidator.ValidationResult = when (type) {
        "Email" -> validator.validateEmail(value)
        "Phone" -> validator.validatePhone(value)
        "Password" -> validator.validatePassword(value)
        "URL" -> validator.validateUrl(value)
        "IP Address" -> validator.validateIPv4(value)
        "Date" -> validator.validateDate(value)
        else -> DataValidator.ValidationResult.Error("Unknown type")
    }

    // Test data
    val samplesByType = mapOf(
        "Email" to listOf("user@example.com", "invalid-email", "test@domain"),
        "Phone" to listOf("555-123-4567", "12345678901", "555-1234"),
        "Password" to listOf("Password123!", "password", "PASSWORD123", "Pass123"),
        "URL" to listOf("https://www.example.com", "http://test.org", "invalid-url"),
        "IP Address" to listOf("192.168.1.1", "255.255.255.255", "256.1.1.1", "192.168.1"),
        "Date" to listOf("2023-12-25", "2023-02-29", "2024-02-29", "2023-13-01")
    )

    for ((type, values) in samplesByType) {
        println("=== $type Validation ===")
        for (value in values) {
            // Exhaustive over the sealed result type, so no else branch is needed
            val status = when (val outcome = runValidator(type, value)) {
                is DataValidator.ValidationResult.Success -> "✓ Valid"
                is DataValidator.ValidationResult.Error -> "✗ ${outcome.message}"
            }
            println("$value: $status")
        }
        println()
    }
}
Practical Application Examples
Log Analyzer
kotlin
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
/**
 * One parsed log line.
 *
 * @property timestamp when the line was logged
 * @property level severity label exactly as it appears in the line (e.g. "ERROR")
 * @property logger name of the logger that wrote the line
 * @property message the remaining text of the line
 * @property thread optional thread name; defaults to null
 * @property exception optional exception text; defaults to null
 */
data class LogEntry(
    val timestamp: LocalDateTime,
    val level: String,
    val logger: String,
    val message: String,
    val thread: String? = null,
    val exception: String? = null
)
/**
 * Parses plain-text log content into [LogEntry] objects and derives simple
 * statistics from them: error counts, exception types, and IP-like addresses
 * mentioned in messages.
 *
 * Two line layouts are recognised. Lines that fit neither layout, or whose
 * timestamp is not a valid date-time, are skipped.
 */
class LogAnalyzer {
    // Layout: "yyyy-MM-dd HH:mm:ss LEVEL [logger] message"
    private val standardPattern =
        Regex("""(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(\w+)\s+\[([^\]]+)\]\s+(.+)""")

    // Layout: "yyyy-MM-dd HH:mm:ss.SSS LEVEL <digits> --- [bracketed] name: message"
    private val detailedPattern =
        Regex("""(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+(\w+)\s+(\d+)\s+---\s+\[([^\]]+)\]\s+([^:]+):\s+(.+)""")

    private val standardFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    private val detailedFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")

    // Compiled once here rather than on every analysis call
    private val exceptionPattern = Regex("""([a-zA-Z.]+Exception)""")
    private val ipPattern = Regex("""\b(?:\d{1,3}\.){3}\d{1,3}\b""")

    /**
     * Splits [logContent] into lines and parses each non-blank one.
     *
     * @return the successfully parsed entries, in input order; unparseable
     *         lines are dropped rather than reported
     */
    fun parseLogFile(logContent: String): List<LogEntry> =
        logContent.lines()
            .filter { it.isNotBlank() }
            .mapNotNull { parseLogLine(it) }

    /** Tries the standard layout first, then the detailed one; null when neither applies. */
    private fun parseLogLine(line: String): LogEntry? =
        parseStandardLine(line) ?: parseDetailedLine(line)

    private fun parseStandardLine(line: String): LogEntry? {
        val match = standardPattern.find(line) ?: return null
        val (rawTimestamp, level, logger, message) = match.destructured
        val timestamp = parseTimestamp(rawTimestamp, standardFormatter) ?: return null
        return LogEntry(
            timestamp = timestamp,
            level = level,
            logger = logger,
            message = message
        )
    }

    private fun parseDetailedLine(line: String): LogEntry? {
        val match = detailedPattern.find(line) ?: return null
        // NOTE(review): the numeric third group is presumably a process id and the
        // bracketed group the thread name (the layout resembles Spring Boot's
        // console format) - confirm against real log samples.
        val (rawTimestamp, level, _, thread, logger, message) = match.destructured
        val timestamp = parseTimestamp(rawTimestamp, detailedFormatter) ?: return null
        return LogEntry(
            timestamp = timestamp,
            level = level,
            logger = logger.trim(),
            message = message,
            thread = thread
        )
    }

    /**
     * Parses [text] with [formatter], returning null instead of throwing when
     * the text has the right digit layout but is not a valid date-time
     * (for example month 13), so one bad line cannot abort a whole file.
     */
    private fun parseTimestamp(text: String, formatter: DateTimeFormatter): LocalDateTime? =
        try {
            LocalDateTime.parse(text, formatter)
        } catch (e: java.time.format.DateTimeParseException) {
            null
        }

    /**
     * Summarises the entries whose level is exactly "ERROR".
     *
     * @return a map with four keys:
     *  - "totalErrors": Int
     *  - "errorsByLogger": Map<String, Int>
     *  - "errorsByHour": Map<Int, Int> keyed by hour of day
     *  - "topExceptions": List<Pair<String, Int>>, at most five, most frequent first
     */
    fun analyzeErrors(entries: List<LogEntry>): Map<String, Any> {
        val errorEntries = entries.filter { it.level == "ERROR" }

        // Extract exception types: the first "...Exception" token of each error message
        val exceptionCounts = errorEntries
            .mapNotNull { entry -> exceptionPattern.find(entry.message)?.groupValues?.get(1) }
            .groupingBy { it }
            .eachCount()

        return mapOf(
            "totalErrors" to errorEntries.size,
            "errorsByLogger" to errorEntries.groupingBy { it.logger }.eachCount(),
            "errorsByHour" to errorEntries.groupingBy { it.timestamp.hour }.eachCount(),
            "topExceptions" to exceptionCounts.toList().sortedByDescending { it.second }.take(5)
        )
    }

    /**
     * Collects every dotted-quad token found in entry messages.
     *
     * The pattern only checks the shape (four groups of 1-3 digits); it does
     * not verify that each group is in 0..255.
     */
    fun extractIpAddresses(entries: List<LogEntry>): Set<String> =
        entries.asSequence()
            .flatMap { entry -> ipPattern.findAll(entry.message) }
            .map { it.value }
            .toSet()

    /**
     * Builds a plain-text report: entry count and time range, the error
     * analysis, and up to ten of the extracted addresses.
     */
    fun generateReport(entries: List<LogEntry>): String {
        val errorAnalysis = analyzeErrors(entries)
        val ipAddresses = extractIpAddresses(entries)
        val earliest = entries.minByOrNull { it.timestamp }?.timestamp
        val latest = entries.maxByOrNull { it.timestamp }?.timestamp

        return buildString {
            appendLine("=== Log Analysis Report ===")
            appendLine("Total log entries: ${entries.size}")
            appendLine("Time range: $earliest to $latest")
            appendLine()
            appendLine("=== Error Analysis ===")
            appendLine("Total errors: ${errorAnalysis["totalErrors"]}")

            // analyzeErrors returns Map<String, Any>; the casts restore the types it puts in
            @Suppress("UNCHECKED_CAST")
            val errorsByLogger = errorAnalysis["errorsByLogger"] as Map<String, Int>
            appendLine("Errors by logger:")
            errorsByLogger.forEach { (logger, count) ->
                appendLine(" $logger: $count")
            }

            @Suppress("UNCHECKED_CAST")
            val topExceptions = errorAnalysis["topExceptions"] as List<Pair<String, Int>>
            appendLine("Top exception types:")
            topExceptions.forEach { (exception, count) ->
                appendLine(" $exception: $count")
            }

            appendLine()
            appendLine("=== IP Address Statistics ===")
            appendLine("Unique IP addresses found: ${ipAddresses.size}")
            ipAddresses.take(10).forEach { ip ->
                appendLine(" $ip")
            }
        }
    }
}
/**
 * Feeds a small simulated log through [LogAnalyzer] and prints the number of
 * parsed entries followed by the generated report.
 */
fun main() {
    println("=== Log Analyzer Example ===")

    // Simulated log content
    val simulatedLog = """
        2023-12-25 10:30:15 INFO [com.example.UserService] User login successful: user123
        2023-12-25 10:30:16 DEBUG [com.example.DatabaseConnection] Connection established to 192.168.1.100
        2023-12-25 10:30:17 ERROR [com.example.PaymentService] Payment failed: java.lang.NullPointerException at line 45
        2023-12-25 10:30:18 WARN [com.example.SecurityService] Multiple failed login attempts from 192.168.1.200
        2023-12-25 10:30:19 ERROR [com.example.UserService] SQL injection attempt detected from 10.0.0.50
        2023-12-25 10:30:20 INFO [com.example.OrderService] Order created successfully: order456
        2023-12-25 10:30:21 ERROR [com.example.DatabaseConnection] Connection timeout: java.sql.SQLException
        2023-12-25 10:30:22 WARN [com.example.SecurityService] Suspicious XSS attack pattern detected
        2023-12-25 10:30:23 INFO [com.example.UserService] User logout: user123
        2023-12-25 10:30:24 ERROR [com.example.PaymentService] Unauthorized access attempt from 172.16.0.10
    """.trimIndent()

    val analyzer = LogAnalyzer()

    // Parse logs
    val parsedEntries = analyzer.parseLogFile(simulatedLog)
    println("Parsed ${parsedEntries.size} log entries")
    println()

    // Generate analysis report
    println(analyzer.generateReport(parsedEntries))
}
Best Practices
1. Regex Design Principles
kotlin
// Good practice: clear, readable regular expressions
/**
 * Example of keeping regexes readable: patterns live in named constants, the
 * email pattern is annotated using COMMENTS mode, and the phone pattern is
 * assembled from named parts. Callers validate through [isValidEmail] and
 * [isValidPhone].
 */
class RegexBestPractices {
    // Provide validation methods instead of exposing regex directly
    fun isValidEmail(email: String): Boolean {
        return EMAIL_REGEX.matches(email)
    }

    fun isValidPhone(phone: String): Boolean {
        return PHONE_REGEX.matches(phone)
    }

    companion object {
        // Use named constants; COMMENTS mode lets each part of the pattern carry a note
        val EMAIL_REGEX = Regex(
            """
            ^[a-zA-Z0-9._%+-]+ # Username part
            @ # @ symbol
            [a-zA-Z0-9.-]+ # Domain part
            \. # Dot
            [a-zA-Z]{2,}$ # Top-level domain
            """.trimIndent(),
            RegexOption.COMMENTS
        )

        // Decompose complex patterns into named pieces
        private val PHONE_AREA_CODE = """\d{3}"""
        private val PHONE_EXCHANGE = """\d{3}"""
        private val PHONE_NUMBER = """\d{4}"""

        // The pieces are joined with '-' and anchored at both ends
        val PHONE_REGEX = Regex(
            listOf(PHONE_AREA_CODE, PHONE_EXCHANGE, PHONE_NUMBER)
                .joinToString(separator = "-", prefix = "^", postfix = "\$")
        )
    }
}
2. Error Handling
kotlin
/**
 * Compiles [pattern] and collects the text of every match in [text].
 *
 * @return a successful [Result] holding the matched substrings in order of
 *         appearance, or a failed [Result] wrapping whatever [Exception] was
 *         thrown while compiling or matching (for example an invalid pattern)
 */
fun safeRegexOperation(pattern: String, text: String): Result<List<String>> =
    try {
        val found = mutableListOf<String>()
        for (hit in Regex(pattern).findAll(text)) {
            found += hit.value
        }
        Result.success(found)
    } catch (failure: Exception) {
        Result.failure(failure)
    }
3. Testing Regular Expressions
kotlin
/**
 * Minimal self-checking test for the email regex used in this chapter.
 */
class RegexTester {
    /**
     * Prints one ✓/✗ line per sample address and fails on the first sample
     * that is classified wrongly.
     *
     * Failures are raised by throwing [AssertionError] directly: Kotlin's
     * `assert` only fires on the JVM when assertions are enabled with `-ea`,
     * so relying on it would let a broken regex pass silently in a default run.
     *
     * @throws AssertionError if a valid address is rejected or an invalid one accepted
     */
    fun testEmailRegex() {
        val emailRegex = Regex("""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$""")
        val validEmails = listOf(
            "user@example.com",
            "test.email@domain.org",
            "user+tag@example.co.uk"
        )
        val invalidEmails = listOf(
            "invalid-email",
            "@domain.com",
            "user@",
            "user@domain"
        )

        println("=== Email Regex Test ===")
        validEmails.forEach { email ->
            val isValid = emailRegex.matches(email)
            println("$email: ${if (isValid) "✓" else "✗"}")
            if (!isValid) throw AssertionError("Should match: $email")
        }
        invalidEmails.forEach { email ->
            val isValid = emailRegex.matches(email)
            // For invalid samples the check mark means "correctly rejected"
            println("$email: ${if (!isValid) "✓" else "✗"}")
            if (isValid) throw AssertionError("Should not match: $email")
        }
    }
}
Next Steps
After mastering regular expressions, let's learn about Kotlin's standard library with its rich built-in functions and utility classes.
Next Chapter: Standard Library
Exercises
- Write a text processing tool that supports extracting and converting various data formats
- Create a form validation system that validates various user input formats
- Implement a simple template engine that supports variable replacement and conditional rendering
- Design a code formatting tool that uses regex to beautify code
- Develop a log monitoring system that analyzes log files in real-time and alerts on anomalous patterns