Created
January 14, 2026 19:30
-
-
Save rock3r/f146ebeabb6a4ab2cdde8ec351aac46b to your computer and use it in GitHub Desktop.
A Kotlin script that fetches every unique artifact version listed in the Maven Central index and outputs a CSV file annotating each version with its SemVer compliance level.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| @file:Repository("https://repo1.maven.org/maven2/") | |
| @file:DependsOn("org.apache.maven.indexer:indexer-reader:7.1.5") | |
| @file:DependsOn("org.slf4j:slf4j-simple:2.0.13") | |
| import org.apache.maven.index.reader.ChunkReader | |
| import java.io.* | |
| import java.net.URL | |
| import java.nio.channels.Channels | |
| import java.util.concurrent.ConcurrentHashMap | |
| import java.util.concurrent.Executors | |
| import java.util.concurrent.TimeUnit | |
| import java.util.concurrent.atomic.AtomicLong | |
| import java.util.zip.GZIPInputStream | |
| import kotlin.system.exitProcess | |
// Remote location of the gzipped Nexus-format repository index that gets downloaded.
val INDEX_URL: String = "https://maven-central-eu.storage-download.googleapis.com/maven2/.index/nexus-maven-repository-index.gz"

// Local file name under which the downloaded index is cached between runs.
val TEMP_INDEX_FILE: String = "nexus-maven-repository-index.gz"

// Name of the CSV report produced by the script.
val OUTPUT_FILE: String = "versions.csv"

// Strict matcher: the regular expression suggested by the SemVer 2.0.0 specification
// (MAJOR.MINOR.PATCH with optional "-prerelease" and "+build" parts, no leading zeros).
val STRICT_SEMVER_REGEX: Regex = Regex("""^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""")

// Lenient matcher: a run of digits, then one or more ".digits" groups, then any suffix.
// Accepts e.g. "1.0", "1.0.0.Final" and "1.0-beta"; rejects e.g. "1" and "v1.0".
val LENIENT_REGEX: Regex = Regex("^\\d+(\\.\\d+)+.*$")
/**
 * Downloads [source] into [target], printing a progress line roughly twice a second.
 *
 * The bytes are first written to a sibling `.part` file and only moved onto [target] once the
 * transfer has finished, so an interrupted run never leaves a truncated file that a later run
 * would mistake for a complete cached index.
 *
 * @throws IOException if the transfer fails, times out, or ends before the advertised length.
 */
private fun downloadIndex(source: String, target: File) {
    val partial = File(target.path + ".part")
    try {
        val connection = URL(source).openConnection().apply {
            // Without timeouts a stalled server would block the script forever.
            connectTimeout = 30_000
            readTimeout = 60_000
        }
        val expectedBytes = connection.contentLengthLong
        var totalBytesRead = 0L
        connection.getInputStream().use { input ->
            FileOutputStream(partial).use { output ->
                val buffer = ByteArray(8192)
                var lastReportTime = 0L
                while (true) {
                    val bytesRead = input.read(buffer)
                    if (bytesRead == -1) break
                    output.write(buffer, 0, bytesRead)
                    totalBytesRead += bytesRead
                    val now = System.currentTimeMillis()
                    if (now - lastReportTime > 500) { // Update every 500ms
                        val progress =
                            if (expectedBytes > 0) "%.2f%%".format(totalBytesRead.toDouble() / expectedBytes * 100) else ""
                        val mbRead = totalBytesRead.toDouble() / (1024 * 1024)
                        print("\rDownloaded %.2f MB %s".format(mbRead, progress))
                        lastReportTime = now
                    }
                }
            }
        }
        // The length is only checked when the server advertised one (contentLengthLong > 0).
        if (expectedBytes > 0 && totalBytesRead < expectedBytes) {
            throw IOException("connection closed after $totalBytesRead of $expectedBytes bytes")
        }
        java.nio.file.Files.move(
            partial.toPath(),
            target.toPath(),
            java.nio.file.StandardCopyOption.REPLACE_EXISTING,
        )
    } finally {
        // No-op after a successful move; removes the leftover on any failure.
        partial.delete()
    }
}

/**
 * Returns the version stored in an index [record], or `null` when none can be found.
 *
 * The `v` field is used when present; otherwise the version is taken as the third
 * `|`-separated segment of the `u` field.
 */
private fun extractVersion(record: Map<String, String>): String? =
    record["v"] ?: record["u"]?.split('|', limit = 4)?.getOrNull(2)

/**
 * Tallies [batch] into a thread-local map first and then merges it into the shared [totals],
 * so the concurrent map is touched once per distinct version rather than once per record.
 */
private fun countBatch(batch: List<String>, totals: ConcurrentHashMap<String, AtomicLong>) {
    val localCounts = HashMap<String, Long>()
    for (version in batch) {
        localCounts[version] = (localCounts[version] ?: 0L) + 1L
    }
    for ((version, count) in localCounts) {
        totals.computeIfAbsent(version) { AtomicLong(0) }.addAndGet(count)
    }
}

/** Classifies [version] as "Strict", "Lenient" or "Not at all" using the two file-level regexes. */
private fun semverCompliance(version: String): String = when {
    STRICT_SEMVER_REGEX.matches(version) -> "Strict"
    LENIENT_REGEX.matches(version) -> "Lenient"
    else -> "Not at all"
}

/**
 * Encodes [value] as a single CSV field: values containing a comma, a double quote or a line
 * break are wrapped in double quotes with embedded quotes doubled; anything else is returned
 * unchanged. Spreadsheet formula prefixes (`=`, `+`, `-`, `@`) are NOT neutralised, because
 * that would alter the reported version strings.
 */
private fun csvField(value: String): String =
    if (value.any { it == ',' || it == '"' || it == '\n' || it == '\r' }) {
        "\"" + value.replace("\"", "\"\"") + "\""
    } else {
        value
    }

/**
 * Script entry point.
 *
 * 1. Downloads the repository index unless a non-empty cached copy already exists.
 * 2. Reads every record, extracts its version and counts occurrences on a worker pool.
 * 3. Writes `Version,Frequency,SemVer Compliance` rows to [OUTPUT_FILE], most frequent first
 *    (ties ordered by version string so the output is reproducible).
 *
 * Exits the process with status 1 if the download or the index processing fails.
 */
fun main() {
    val indexFile = File(TEMP_INDEX_FILE)
    if (!indexFile.exists() || indexFile.length() == 0L) {
        println("Downloading index from $INDEX_URL...")
        try {
            downloadIndex(INDEX_URL, indexFile)
            println("\rDownload complete. ")
        } catch (e: Exception) {
            System.err.println("\nFailed to download index: ${e.message}")
            exitProcess(1)
        }
    } else {
        println("Using existing index file: ${indexFile.absolutePath} (${indexFile.length() / (1024 * 1024)} MB)")
    }

    println("Processing index...")
    val versionCounts = ConcurrentHashMap<String, AtomicLong>()
    // Only the reading thread updates this counter, so a plain Long is sufficient.
    var totalProcessed = 0L
    val executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors())
    try {
        val pending = ArrayList<java.util.concurrent.Future<*>>()
        FileInputStream(indexFile).buffered().use { fileIn ->
            // No explicit GZIP magic-number check: a malformed file surfaces as an exception here.
            ChunkReader("index", fileIn).use { reader ->
                val batchSize = 10_000
                var batch = ArrayList<String>(batchSize)
                for (record in reader) {
                    // Dump the first few records to show what the index data looks like.
                    if (totalProcessed < 5) {
                        println("\nDebug Record: $record")
                    }
                    val version = extractVersion(record)
                    if (version != null) {
                        batch.add(version)
                        if (batch.size >= batchSize) {
                            // Hand the full list to a worker and start a fresh one; the worker
                            // must never see a list the reader is still appending to.
                            val fullBatch = batch
                            pending.add(executor.submit(Runnable { countBatch(fullBatch, versionCounts) }))
                            batch = ArrayList(batchSize)
                        }
                    }
                    totalProcessed++
                    if (totalProcessed % 10_000L == 0L) {
                        print("\rProcessed $totalProcessed artifacts...")
                    }
                }
                // Process remaining
                if (batch.isNotEmpty()) {
                    val lastBatch = batch
                    pending.add(executor.submit(Runnable { countBatch(lastBatch, versionCounts) }))
                }
            }
        }
        // Waiting on every Future both guarantees the counts are complete and rethrows
        // (as ExecutionException) anything a worker threw instead of silently dropping it.
        for (task in pending) {
            task.get()
        }
        println("\rProcessed $totalProcessed artifacts total. ")
    } catch (e: Exception) {
        System.err.println("\nError processing index: ${e.message}")
        e.printStackTrace()
        exitProcess(1)
    } finally {
        executor.shutdownNow()
    }

    println("\nProcessing complete. Found ${versionCounts.size} unique versions.")
    println("Writing results to $OUTPUT_FILE...")
    val ranked = versionCounts.entries
        .map { it.key to it.value.get() }
        .sortedWith(compareByDescending<Pair<String, Long>> { it.second }.thenBy { it.first })
    File(OUTPUT_FILE).printWriter().use { writer ->
        writer.println("Version,Frequency,SemVer Compliance")
        for ((version, count) in ranked) {
            writer.println("${csvField(version)},$count,${semverCompliance(version)}")
        }
        // PrintWriter swallows I/O errors; surface them instead of reporting a bogus "Done.".
        if (writer.checkError()) {
            throw IOException("Failed to write $OUTPUT_FILE")
        }
    }
    println("Done.")
}

// Kotlin scripts run their top-level statements, so the entry point is invoked explicitly.
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment