Skip to content

Instantly share code, notes, and snippets.

@rock3r
Created January 14, 2026 19:30
Show Gist options
  • Select an option

  • Save rock3r/f146ebeabb6a4ab2cdde8ec351aac46b to your computer and use it in GitHub Desktop.

Select an option

Save rock3r/f146ebeabb6a4ab2cdde8ec351aac46b to your computer and use it in GitHub Desktop.
A Kotlin script that fetches all unique versions on Maven Central and outputs a CSV with added SemVer compliance info.
@file:Repository("https://repo1.maven.org/maven2/")
@file:DependsOn("org.apache.maven.indexer:indexer-reader:7.1.5")
@file:DependsOn("org.slf4j:slf4j-simple:2.0.13")
import org.apache.maven.index.reader.ChunkReader
import java.io.*
import java.net.URL
import java.nio.channels.Channels
import java.nio.file.Files
import java.nio.file.StandardCopyOption
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.Executors
import java.util.concurrent.Future
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong
import java.util.zip.GZIPInputStream
import kotlin.system.exitProcess
/** Location of the gzip-compressed Maven Central repository index that the script downloads. */
val INDEX_URL: String = "https://maven-central-eu.storage-download.googleapis.com/maven2/.index/nexus-maven-repository-index.gz"

/** Name of the CSV report written at the end of the run. */
val OUTPUT_FILE: String = "versions.csv"

/** Name of the local copy of the index; it is reused on later runs when present and non-empty. */
val TEMP_INDEX_FILE: String = "nexus-maven-repository-index.gz"

/**
 * Pattern for SemVer 2.0.0 version strings: `MAJOR.MINOR.PATCH` without leading zeros,
 * optionally followed by `-prerelease` and/or `+build` identifiers.
 */
val STRICT_SEMVER_REGEX: Regex = Regex("""^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""")

/**
 * Looser pattern: one or more digits, then at least one `.<digits>` group, then any trailing text.
 * Accepts for example `1.0`, `1.0.0.Final` and `1.0-beta`.
 */
val LENIENT_REGEX: Regex = Regex("""^\d+(\.\d+)+.*$""")
/**
 * Script entry point.
 *
 * 1. Reuses [TEMP_INDEX_FILE] when it exists and is non-empty, otherwise downloads it from [INDEX_URL].
 * 2. Counts how often each version string occurs in the index.
 * 3. Writes `Version,Frequency,SemVer Compliance` rows to [OUTPUT_FILE], most frequent first.
 *
 * Terminates the process with exit code 1 when any of the steps fails.
 */
fun main() {
    val indexFile = File(TEMP_INDEX_FILE)
    if (!indexFile.exists() || indexFile.length() == 0L) {
        println("Downloading index from $INDEX_URL...")
        try {
            downloadIndex(indexFile)
        } catch (e: Exception) {
            System.err.println("\nFailed to download index: ${e.message}")
            exitProcess(1)
        }
        println("\rDownload complete. ")
    } else {
        println("Using existing index file: ${indexFile.absolutePath} (${indexFile.length() / (1024 * 1024)} MB)")
    }

    println("Processing index...")
    val versionCounts: Map<String, AtomicLong> = try {
        countVersions(indexFile)
    } catch (e: Exception) {
        System.err.println("\nError processing index: ${e.message}")
        e.printStackTrace()
        exitProcess(1)
    }
    println("\nProcessing complete. Found ${versionCounts.size} unique versions.")

    println("Writing results to $OUTPUT_FILE...")
    try {
        writeCsv(versionCounts)
    } catch (e: IOException) {
        System.err.println("Failed to write $OUTPUT_FILE: ${e.message}")
        exitProcess(1)
    }
    println("Done.")
}

/**
 * Streams [INDEX_URL] into [target], printing progress roughly every 500 ms.
 *
 * The bytes are first written to a sibling `.part` file and only moved onto [target] once the
 * whole body has been received, so an interrupted run can never leave a truncated file that a
 * later run would mistake for a complete index.
 *
 * @throws IOException if the connection fails, times out, or delivers fewer bytes than announced.
 */
private fun downloadIndex(target: File) {
    val partFile = File(target.path + ".part")
    try {
        val connection = URL(INDEX_URL).openConnection().apply {
            // Without timeouts a stalled connection would block the script forever.
            connectTimeout = 30_000
            readTimeout = 60_000
        }
        val fileSize = connection.contentLengthLong
        var totalBytesRead = 0L
        connection.getInputStream().use { input ->
            FileOutputStream(partFile).use { output ->
                val buffer = ByteArray(8192)
                var lastReportTime = 0L
                while (true) {
                    val bytesRead = input.read(buffer)
                    if (bytesRead == -1) break
                    output.write(buffer, 0, bytesRead)
                    totalBytesRead += bytesRead
                    val currentTime = System.currentTimeMillis()
                    if (currentTime - lastReportTime > 500) { // Update every 500ms
                        val progress = if (fileSize > 0) "%.2f%%".format(totalBytesRead.toDouble() / fileSize * 100) else ""
                        val mbRead = totalBytesRead.toDouble() / (1024 * 1024)
                        print("\rDownloaded %.2f MB %s".format(mbRead, progress))
                        lastReportTime = currentTime
                    }
                }
            }
        }
        // contentLengthLong is -1 when the server did not announce a size; only verify when it did.
        if (fileSize > 0 && totalBytesRead != fileSize) {
            throw IOException("Incomplete download: got $totalBytesRead of $fileSize bytes")
        }
        Files.move(partFile.toPath(), target.toPath(), StandardCopyOption.REPLACE_EXISTING)
    } finally {
        // No-op after a successful move; removes the partial file after a failure.
        partFile.delete()
    }
}

/**
 * Reads every record of [indexFile] and returns how many times each version string was seen.
 *
 * Records are read on the calling thread and handed to a fixed thread pool in batches; each
 * worker tallies its batch locally before merging into the shared map.
 *
 * @throws Exception if reading the index fails or any worker task fails.
 */
private fun countVersions(indexFile: File): Map<String, AtomicLong> {
    val batchSize = 10_000
    val versionCounts = ConcurrentHashMap<String, AtomicLong>()
    val executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
    val pending = ArrayList<Future<*>>()

    fun submitBatch(batch: List<String>) {
        pending.add(
            executor.submit(Runnable {
                // Tally locally first so the shared map is touched once per distinct version in the batch.
                val localCounts = HashMap<String, Long>()
                for (v in batch) {
                    localCounts[v] = (localCounts[v] ?: 0L) + 1L
                }
                for ((v, count) in localCounts) {
                    versionCounts.computeIfAbsent(v) { AtomicLong(0) }.addAndGet(count)
                }
            })
        )
    }

    var totalProcessed = 0L
    try {
        FileInputStream(indexFile).use { fileIn ->
            // The raw file stream is passed as-is; ChunkReader is expected to handle the GZIP
            // decoding itself and to fail if the file is not a valid index chunk.
            ChunkReader("index", fileIn).use { reader ->
                var batch = ArrayList<String>(batchSize)
                for (record in reader) {
                    // Show the first few raw records so the key layout can be eyeballed.
                    if (totalProcessed < 5) {
                        println("\nDebug Record: $record")
                    }
                    val version = record["v"] ?: versionFromUinfo(record["u"])
                    if (version != null) {
                        batch.add(version)
                        if (batch.size >= batchSize) {
                            submitBatch(batch)
                            batch = ArrayList(batchSize)
                        }
                    }
                    totalProcessed++
                    if (totalProcessed % 10_000L == 0L) {
                        print("\rProcessed $totalProcessed artifacts...")
                    }
                }
                // Process remaining
                if (batch.isNotEmpty()) {
                    submitBatch(batch)
                }
            }
        }
        executor.shutdown()
        // get() blocks until the task has finished and rethrows any failure from the worker,
        // so an incomplete tally can never be reported as a success.
        pending.forEach { it.get() }
    } finally {
        // Harmless once terminated; on failure it drops queued work and frees the pool threads.
        executor.shutdownNow()
    }
    println("\rProcessed $totalProcessed artifacts total. ")
    return versionCounts
}

/**
 * Returns the third `|`-separated field of [uinfo], which this script treats as the version,
 * or `null` when [uinfo] is `null` or has fewer than three fields.
 */
private fun versionFromUinfo(uinfo: String?): String? {
    if (uinfo == null) return null
    val firstPipe = uinfo.indexOf('|')
    if (firstPipe == -1) return null
    val secondPipe = uinfo.indexOf('|', firstPipe + 1)
    if (secondPipe == -1) return null
    val thirdPipe = uinfo.indexOf('|', secondPipe + 1)
    return uinfo.substring(secondPipe + 1, if (thirdPipe == -1) uinfo.length else thirdPipe)
}

/**
 * Writes one CSV row per version to [OUTPUT_FILE], ordered by descending frequency and then by
 * version string so that the output is reproducible between runs.
 *
 * @throws IOException if the file cannot be written.
 */
private fun writeCsv(versionCounts: Map<String, AtomicLong>) {
    val rows = versionCounts.entries.sortedWith(
        compareByDescending<Map.Entry<String, AtomicLong>> { it.value.get() }.thenBy { it.key }
    )
    File(OUTPUT_FILE).printWriter().use { writer ->
        writer.println("Version,Frequency,SemVer Compliance")
        for ((version, counter) in rows) {
            val compliance = when {
                STRICT_SEMVER_REGEX.matches(version) -> "Strict"
                LENIENT_REGEX.matches(version) -> "Lenient"
                else -> "Not at all"
            }
            writer.println("${csvField(version)},${counter.get()},$compliance")
        }
        // PrintWriter swallows I/O errors; surface them instead of reporting success.
        if (writer.checkError()) {
            throw IOException("I/O error while writing CSV rows")
        }
    }
}

/**
 * Quotes [value] for use as a CSV field: values containing a comma, double quote, CR or LF are
 * wrapped in double quotes with embedded quotes doubled; all other values are returned unchanged.
 */
private fun csvField(value: String): String {
    val needsQuoting = value.any { it == ',' || it == '"' || it == '\n' || it == '\r' }
    return if (needsQuoting) "\"" + value.replace("\"", "\"\"") + "\"" else value
}

main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment