rock3r/MavenIndexAnalyzer.main.kts

## MavenIndexAnalyzer.main.kts
@file:Repository("https://repo1.maven.org/maven2/")
@file:DependsOn("org.apache.maven.indexer:indexer-reader:7.1.5")
@file:DependsOn("org.slf4j:slf4j-simple:2.0.13")

import org.apache.maven.index.reader.ChunkReader
import java.io.*
import java.net.URL
import java.nio.channels.Channels
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong
import java.util.zip.GZIPInputStream
import kotlin.system.exitProcess

// Remote location of the gzipped Maven Central index, and the local file names used by the script.
val INDEX_URL = "https://maven-central-eu.storage-download.googleapis.com/maven2/.index/nexus-maven-repository-index.gz"
val OUTPUT_FILE = "versions.csv"
val TEMP_INDEX_FILE = "nexus-maven-repository-index.gz"

// Strict matcher: the regular expression labelled "official" for SemVer 2.0.0, i.e.
// MAJOR.MINOR.PATCH without leading zeros, plus optional "-prerelease" and "+build" parts.
// Written as a raw string so the backslashes reach the regex engine unchanged.
val STRICT_SEMVER_REGEX =
    """^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""".toRegex()

// Lenient matcher: digits, then one or more ".digits" groups, then anything at all.
// Accepts e.g. "1.0", "1.0.0.Final" and "1.0-beta"; rejects "1" and "v1.0".
val LENIENT_REGEX = """^\d+(\.\d+)+.*$""".toRegex()

/**
 * Script entry point.
 *
 * 1. Downloads the index from [INDEX_URL] into [TEMP_INDEX_FILE] unless a non-empty copy exists.
 * 2. Reads every record of the index and counts how often each version string occurs.
 * 3. Writes the counts to [OUTPUT_FILE] as CSV (most frequent first, ties ordered by version),
 *    labelling each version "Strict", "Lenient" or "Not at all" using [STRICT_SEMVER_REGEX] and
 *    [LENIENT_REGEX].
 *
 * Exits the process with status 1 when the download, the index processing or the CSV write fails.
 */
fun main() {
    val indexFile = File(TEMP_INDEX_FILE)

    if (!indexFile.exists() || indexFile.length() == 0L) {
        println("Downloading index from $INDEX_URL...")
        try {
            downloadIndex(indexFile)
        } catch (e: Exception) {
            System.err.println("\nFailed to download index: ${e.message}")
            exitProcess(1)
        }
        // Trailing spaces overwrite whatever is left of the "\r"-refreshed progress line.
        println("\rDownload complete.                                         ")
    } else {
        println("Using existing index file: ${indexFile.absolutePath} (${indexFile.length() / (1024 * 1024)} MB)")
    }

    println("Processing index...")
    val versionCounts = ConcurrentHashMap<String, AtomicLong>()
    var totalProcessed = 0L

    try {
        val executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
        try {
            val pendingTasks = ArrayList<java.util.concurrent.Future<*>>()
            val batchSize = 10_000
            var batch = ArrayList<String>(batchSize)

            // Counts one batch on a pool thread: tally locally first, then merge into the shared
            // map, so the concurrent map is touched once per distinct version instead of once
            // per record. The Future is kept so a failure inside the task is not lost.
            fun submitBatch(versions: List<String>) {
                pendingTasks.add(
                    executor.submit(Runnable {
                        versions.groupingBy { it }.eachCount().forEach { (version, occurrences) ->
                            versionCounts.computeIfAbsent(version) { AtomicLong() }.addAndGet(occurrences.toLong())
                        }
                    })
                )
            }

            // ChunkReader is handed the raw file stream; the buffer only reduces the number of
            // small reads that hit the file.
            FileInputStream(indexFile).buffered().use { fileIn ->
                ChunkReader("index", fileIn).use { reader ->
                    for (record in reader) {
                        // Echo the first few raw records as a quick sanity check of the format.
                        if (totalProcessed < 5) {
                            println("\nDebug Record: $record")
                        }

                        val version = extractVersion(record)
                        if (version != null) {
                            batch.add(version)
                            if (batch.size >= batchSize) {
                                submitBatch(batch)
                                batch = ArrayList(batchSize)
                            }
                        }

                        totalProcessed++
                        if (totalProcessed % 10_000L == 0L) {
                            print("\rProcessed $totalProcessed artifacts...")
                        }
                    }
                }
            }

            // Flush the last, partially filled batch.
            if (batch.isNotEmpty()) {
                submitBatch(batch)
            }

            // Wait for every task; get() rethrows a task failure as ExecutionException, which
            // the outer catch reports instead of silently producing incomplete counts.
            pendingTasks.forEach { it.get() }
        } finally {
            executor.shutdown()
        }
        println("\rProcessed $totalProcessed artifacts total.         ")
    } catch (e: Exception) {
        System.err.println("\nError processing index: ${e.message}")
        e.printStackTrace()
        exitProcess(1)
    }

    println("\nProcessing complete. Found ${versionCounts.size} unique versions.")
    println("Writing results to $OUTPUT_FILE...")

    // Snapshot the counters, then sort by frequency; the secondary key makes the order of
    // equally frequent versions reproducible between runs.
    val rows = versionCounts
        .map { (version, counter) -> version to counter.get() }
        .sortedWith(compareByDescending<Pair<String, Long>> { it.second }.thenBy { it.first })

    val writeFailed = File(OUTPUT_FILE).printWriter().use { writer ->
        writer.println("Version,Frequency,SemVer Compliance")
        for ((version, count) in rows) {
            val compliance = when {
                STRICT_SEMVER_REGEX.matches(version) -> "Strict"
                LENIENT_REGEX.matches(version) -> "Lenient"
                else -> "Not at all"
            }
            writer.println("${csvField(version)},$count,$compliance")
        }
        // PrintWriter never throws on write errors; it only records them.
        writer.checkError()
    }
    if (writeFailed) {
        System.err.println("Failed to write results to $OUTPUT_FILE")
        exitProcess(1)
    }

    println("Done.")
}

/**
 * Streams [INDEX_URL] into [target], printing a progress line at most every 500 ms.
 *
 * The bytes are written to a sibling ".part" file that is moved onto [target] only after the
 * whole body has been received, so an interrupted run cannot leave a truncated file under the
 * final name.
 *
 * @throws IOException if the transfer fails or ends before the announced content length.
 */
private fun downloadIndex(target: File) {
    val partFile = File(target.path + ".part")
    try {
        val connection = URL(INDEX_URL).openConnection()
        val fileSize = connection.contentLengthLong // -1 when the server does not announce it
        connection.getInputStream().use { input ->
            FileOutputStream(partFile).use { output ->
                val buffer = ByteArray(8192)
                var totalBytesRead = 0L
                var lastReportTime = 0L

                while (true) {
                    val bytesRead = input.read(buffer)
                    if (bytesRead == -1) break
                    output.write(buffer, 0, bytesRead)
                    totalBytesRead += bytesRead

                    val currentTime = System.currentTimeMillis()
                    if (currentTime - lastReportTime > 500) { // Update every 500ms
                        val progress = if (fileSize > 0) "%.2f%%".format(totalBytesRead.toDouble() / fileSize * 100) else ""
                        val mbRead = totalBytesRead.toDouble() / (1024 * 1024)
                        print("\rDownloaded %.2f MB %s".format(mbRead, progress))
                        lastReportTime = currentTime
                    }
                }

                if (fileSize > 0 && totalBytesRead != fileSize) {
                    throw IOException("Incomplete download: received $totalBytesRead of $fileSize bytes")
                }
            }
        }
        java.nio.file.Files.move(
            partFile.toPath(),
            target.toPath(),
            java.nio.file.StandardCopyOption.REPLACE_EXISTING,
        )
    } finally {
        // After a successful move the ".part" file is gone and this is a no-op.
        partFile.delete()
    }
}

/**
 * Returns the version carried by an index [record]: its "v" value when present, otherwise the
 * third '|'-separated field of its "u" value. Returns null when neither is available.
 */
private fun extractVersion(record: Map<String, String>): String? {
    // NOTE(review): "u" is presumably groupId|artifactId|version|...; confirm against the indexer docs.
    return record["v"] ?: record["u"]?.split('|', limit = 4)?.getOrNull(2)
}

/**
 * Quotes [value] for use as a CSV field: values containing a comma, a double quote or a line
 * break are wrapped in double quotes with embedded quotes doubled; others are returned as-is.
 */
private fun csvField(value: String): String =
    if (value.any { it == ',' || it == '"' || it == '\n' || it == '\r' }) {
        "\"" + value.replace("\"", "\"\"") + "\""
    } else {
        value
    }

// Top-level script statement: runs the analysis defined in main() above.
main()
	@file:Repository("https://repo1.maven.org/maven2/")
	@file:DependsOn("org.apache.maven.indexer:indexer-reader:7.1.5")
	@file:DependsOn("org.slf4j:slf4j-simple:2.0.13")

	import org.apache.maven.index.reader.ChunkReader
	import java.io.*
	import java.net.URL
	import java.nio.channels.Channels
	import java.util.concurrent.ConcurrentHashMap
	import java.util.concurrent.Executors
	import java.util.concurrent.TimeUnit
	import java.util.concurrent.atomic.AtomicLong
	import java.util.zip.GZIPInputStream
	import kotlin.system.exitProcess

	// Remote location of the gzipped Maven Central index, and the local file names used by the script.
	val INDEX_URL = "https://maven-central-eu.storage-download.googleapis.com/maven2/.index/nexus-maven-repository-index.gz"
	val OUTPUT_FILE = "versions.csv"
	val TEMP_INDEX_FILE = "nexus-maven-repository-index.gz"

	// SemVer 2.0.0 regex (official): MAJOR.MINOR.PATCH without leading zeros, plus optional
	// "-prerelease" and "+build" parts. The alternations must be plain `|` and the numeric parts
	// need their `*` quantifiers; with `\|` and the quantifiers missing, even "1.0.0" does not match.
	val STRICT_SEMVER_REGEX = Regex("""^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""")

	// Lenient: starts with digits and contains at least one dot followed by digits.
	// Examples: 1.0, 1.0.0.Final, 1.0-beta
	val LENIENT_REGEX = Regex("^\\d+(\\.\\d+)+.*$")

	/**
	 * Script entry point.
	 *
	 * 1. Downloads the index from [INDEX_URL] into [TEMP_INDEX_FILE] unless a non-empty copy exists.
	 * 2. Reads every record of the index and counts how often each version string occurs.
	 * 3. Writes the counts to [OUTPUT_FILE] as CSV (most frequent first, ties ordered by version),
	 *    labelling each version "Strict", "Lenient" or "Not at all" using [STRICT_SEMVER_REGEX] and
	 *    [LENIENT_REGEX].
	 *
	 * Exits the process with status 1 when the download, the index processing or the CSV write fails.
	 */
	fun main() {
	    val indexFile = File(TEMP_INDEX_FILE)

	    if (!indexFile.exists() || indexFile.length() == 0L) {
	        println("Downloading index from $INDEX_URL...")
	        try {
	            downloadIndex(indexFile)
	        } catch (e: Exception) {
	            System.err.println("\nFailed to download index: ${e.message}")
	            exitProcess(1)
	        }
	        // Trailing spaces overwrite whatever is left of the "\r"-refreshed progress line.
	        println("\rDownload complete.                                         ")
	    } else {
	        println("Using existing index file: ${indexFile.absolutePath} (${indexFile.length() / (1024 * 1024)} MB)")
	    }

	    println("Processing index...")
	    val versionCounts = ConcurrentHashMap<String, AtomicLong>()
	    var totalProcessed = 0L

	    try {
	        val executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
	        try {
	            val pendingTasks = ArrayList<java.util.concurrent.Future<*>>()
	            val batchSize = 10_000
	            var batch = ArrayList<String>(batchSize)

	            // Counts one batch on a pool thread: tally locally first, then merge into the shared
	            // map, so the concurrent map is touched once per distinct version instead of once
	            // per record. The Future is kept so a failure inside the task is not lost.
	            fun submitBatch(versions: List<String>) {
	                pendingTasks.add(
	                    executor.submit(Runnable {
	                        versions.groupingBy { it }.eachCount().forEach { (version, occurrences) ->
	                            versionCounts.computeIfAbsent(version) { AtomicLong() }.addAndGet(occurrences.toLong())
	                        }
	                    })
	                )
	            }

	            // ChunkReader is handed the raw file stream; the buffer only reduces the number of
	            // small reads that hit the file.
	            FileInputStream(indexFile).buffered().use { fileIn ->
	                ChunkReader("index", fileIn).use { reader ->
	                    for (record in reader) {
	                        // Echo the first few raw records as a quick sanity check of the format.
	                        if (totalProcessed < 5) {
	                            println("\nDebug Record: $record")
	                        }

	                        val version = extractVersion(record)
	                        if (version != null) {
	                            batch.add(version)
	                            if (batch.size >= batchSize) {
	                                submitBatch(batch)
	                                batch = ArrayList(batchSize)
	                            }
	                        }

	                        totalProcessed++
	                        if (totalProcessed % 10_000L == 0L) {
	                            print("\rProcessed $totalProcessed artifacts...")
	                        }
	                    }
	                }
	            }

	            // Flush the last, partially filled batch.
	            if (batch.isNotEmpty()) {
	                submitBatch(batch)
	            }

	            // Wait for every task; get() rethrows a task failure as ExecutionException, which
	            // the outer catch reports instead of silently producing incomplete counts.
	            pendingTasks.forEach { it.get() }
	        } finally {
	            executor.shutdown()
	        }
	        println("\rProcessed $totalProcessed artifacts total.         ")
	    } catch (e: Exception) {
	        System.err.println("\nError processing index: ${e.message}")
	        e.printStackTrace()
	        exitProcess(1)
	    }

	    println("\nProcessing complete. Found ${versionCounts.size} unique versions.")
	    println("Writing results to $OUTPUT_FILE...")

	    // Snapshot the counters, then sort by frequency; the secondary key makes the order of
	    // equally frequent versions reproducible between runs.
	    val rows = versionCounts
	        .map { (version, counter) -> version to counter.get() }
	        .sortedWith(compareByDescending<Pair<String, Long>> { it.second }.thenBy { it.first })

	    val writeFailed = File(OUTPUT_FILE).printWriter().use { writer ->
	        writer.println("Version,Frequency,SemVer Compliance")
	        for ((version, count) in rows) {
	            val compliance = when {
	                STRICT_SEMVER_REGEX.matches(version) -> "Strict"
	                LENIENT_REGEX.matches(version) -> "Lenient"
	                else -> "Not at all"
	            }
	            writer.println("${csvField(version)},$count,$compliance")
	        }
	        // PrintWriter never throws on write errors; it only records them.
	        writer.checkError()
	    }
	    if (writeFailed) {
	        System.err.println("Failed to write results to $OUTPUT_FILE")
	        exitProcess(1)
	    }

	    println("Done.")
	}

	/**
	 * Streams [INDEX_URL] into [target], printing a progress line at most every 500 ms.
	 *
	 * The bytes are written to a sibling ".part" file that is moved onto [target] only after the
	 * whole body has been received, so an interrupted run cannot leave a truncated file under the
	 * final name.
	 *
	 * @throws IOException if the transfer fails or ends before the announced content length.
	 */
	private fun downloadIndex(target: File) {
	    val partFile = File(target.path + ".part")
	    try {
	        val connection = URL(INDEX_URL).openConnection()
	        val fileSize = connection.contentLengthLong // -1 when the server does not announce it
	        connection.getInputStream().use { input ->
	            FileOutputStream(partFile).use { output ->
	                val buffer = ByteArray(8192)
	                var totalBytesRead = 0L
	                var lastReportTime = 0L

	                while (true) {
	                    val bytesRead = input.read(buffer)
	                    if (bytesRead == -1) break
	                    output.write(buffer, 0, bytesRead)
	                    totalBytesRead += bytesRead

	                    val currentTime = System.currentTimeMillis()
	                    if (currentTime - lastReportTime > 500) { // Update every 500ms
	                        val progress = if (fileSize > 0) "%.2f%%".format(totalBytesRead.toDouble() / fileSize * 100) else ""
	                        val mbRead = totalBytesRead.toDouble() / (1024 * 1024)
	                        print("\rDownloaded %.2f MB %s".format(mbRead, progress))
	                        lastReportTime = currentTime
	                    }
	                }

	                if (fileSize > 0 && totalBytesRead != fileSize) {
	                    throw IOException("Incomplete download: received $totalBytesRead of $fileSize bytes")
	                }
	            }
	        }
	        java.nio.file.Files.move(
	            partFile.toPath(),
	            target.toPath(),
	            java.nio.file.StandardCopyOption.REPLACE_EXISTING,
	        )
	    } finally {
	        // After a successful move the ".part" file is gone and this is a no-op.
	        partFile.delete()
	    }
	}

	/**
	 * Returns the version carried by an index [record]: its "v" value when present, otherwise the
	 * third '|'-separated field of its "u" value. Returns null when neither is available.
	 */
	private fun extractVersion(record: Map<String, String>): String? {
	    // NOTE(review): "u" is presumably groupId|artifactId|version|...; confirm against the indexer docs.
	    return record["v"] ?: record["u"]?.split('|', limit = 4)?.getOrNull(2)
	}

	/**
	 * Quotes [value] for use as a CSV field: values containing a comma, a double quote or a line
	 * break are wrapped in double quotes with embedded quotes doubled; others are returned as-is.
	 */
	private fun csvField(value: String): String =
	    if (value.any { it == ',' || it == '"' || it == '\n' || it == '\r' }) {
	        "\"" + value.replace("\"", "\"\"") + "\""
	    } else {
	        value
	    }

	// Top-level script statement: runs the analysis defined in main() above.
	main()
No results found