Last active
September 16, 2025 08:13
-
-
Save agricolamz/96a610fd3e029350b2f814c13f92823d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This code is licensed under the terms of the MIT license | |
| # Author: George Moroz | |
| # Date: 2025-08-14 | |
| # Update: 2025-09-16 | |
| # see the official docs: https://ruscorpora.github.io/public-api/ | |
# Packages: tidyverse supplies the data helpers used below (purrr, dplyr,
# tidyr, stringr, tibble); httr2 builds and performs the HTTP requests.
library(tidyverse)
library(httr2)

# API token, sent as a Bearer credential with every request. It is read from
# the RUSCORPORA_TOKEN environment variable when that variable is set, so the
# secret does not have to be written into the script; otherwise the
# placeholder below is used and has to be replaced by hand.
my_token <- Sys.getenv("RUSCORPORA_TOKEN", unset = "put your token here")

# Lemma queried by the word-portrait sections that read `lemma_for_search`.
lemma_for_search <- "накачать"
# check authentication ----------------------------------------------------
# Call the auth-check endpoint with the Bearer token. The response object is
# left unassigned so that it is auto-printed at top level.
request("https://ruscorpora.ru/api/v1/auth/check-authenticated/") |>
  req_headers(Authorization = str_glue("Bearer {my_token}")) |>
  req_perform()

# Example output, as recorded in the original script:
# <httr2_response>
# GET https://ruscorpora.ru/api/v1/auth/check-authenticated/
# Status: 200 OK
# Content-Type: application/json
# Body: In memory (4 bytes)
# PORTRAIT_WORD_INFO ------------------------------------------------------
# Request the PORTRAIT_WORD_INFO part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_WORD_INFO")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per item. Columns are assigned by position: the first three parsing
# fields become text, lex and gr, and a fourth one (when present) becomes sem.
# Items with any other number of parsing fields yield NULL and are therefore
# dropped by list_rbind().
result$propsData$items |>
  map(\(item) {
    column_names <- c("text", "lex", "gr", "sem")
    n_fields <- length(item$parsingFields)
    if (n_fields < 3 || n_fields > 4) {
      return(NULL)
    }
    item$parsingFields |>
      # flatten each field's nested value list into one "; "-separated string
      map_chr(\(field) paste0(unlist(field$value), collapse = "; ")) |>
      set_names(column_names[seq_len(n_fields)]) |>
      as_tibble_row()
  }) |>
  list_rbind()
# PORTRAIT_CONCORDANCE ----------------------------------------------------
# Request the PORTRAIT_CONCORDANCE part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_CONCORDANCE")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per concordance group. Only the first document of each group is
# read, and within it only the first snippet of the first snippet group and
# the first word sequence of that snippet.
result$concordanceData$groups |>
  map(\(group) {
    first_doc <- group$docs[[1]]
    meta_fields <- first_doc$info$docExplainInfo$items[[1]]$parsingFields
    first_snippet <- first_doc$snippetGroups[[1]]$snippets[[1]]

    # glue the word tokens back together and normalise the whitespace
    snippet_text <- first_snippet$sequences[[1]]$words |>
      map_chr("text") |>
      str_c(collapse = "") |>
      str_squish()

    # document metadata: long (field, value) pairs spread into one wide row
    tibble(
      field = map_chr(meta_fields, "name"),
      value = unlist(map(meta_fields, "value"))
    ) |>
      pivot_wider(names_from = "field", values_from = "value") |>
      mutate(
        title = first_doc$info$title,
        language = first_snippet$langInfo$lang,
        text = snippet_text
      )
  }) |>
  list_rbind() |>
  mutate(lemma = lemma_for_search) |>
  # lemma, text and title lead; the metadata columns follow
  relocate(lemma, text, title) |>
  View()
# PORTRAIT_STATS ----------------------------------------------------
# Request the PORTRAIT_STATS part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_STATS")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per (field, value) pair: every entry of a field's `values` list
# becomes a row with its counts, and the field name is put in front as the
# first column.
result$statsData$fieldStats |>
  map(\(field_stat) {
    field_stat$values |>
      map(\(entry) {
        tibble(
          value = entry$key$valString$v,
          count = entry$count,
          docCount = entry$docCount,
          totalCount = entry$totalCount,
          totalDocCount = entry$totalDocCount
        )
      }) |>
      list_rbind() |>
      mutate(field = field_stat$field, .before = 1)
  }) |>
  list_rbind()
# PORTRAIT_SKETCH ----------------------------------------------------
# Request the PORTRAIT_SKETCH part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_SKETCH")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per collocation, labelled with the syntactic relation of the group
# it came from and with the queried lemma.
result$sketchData$collocates |>
  map(\(relation_group) {
    relation_group$collocations |>
      map(\(pair) {
        tibble(
          collocate = pair$collocate$valString$v,
          # only the first metric is read; presumably the Dice score, as the
          # column name suggests - confirm against the API docs
          dice = pair$metrics[[1]]$value
        )
      }) |>
      list_rbind() |>
      mutate(syntactic_relation = relation_group[["sketchSynRelation"]])
  }) |>
  list_rbind() |>
  mutate(lemma = lemma_for_search)
# PORTRAIT_FREQUENCY ----------------------------------------------------
# Request the PORTRAIT_FREQUENCY part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_FREQUENCY")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# Print the `ipm` entry of the frequency data.
result$frequencyData$ipm
# PORTRAIT_SIMILAR ----------------------------------------------------
# Request the PORTRAIT_SIMILAR part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_SIMILAR")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# Similar words and their weights; only the first entry of `similarData` is
# read.
# NOTE(review): the column name `metics` looks like a typo for `metrics`; it
# is kept unchanged so the output stays the same.
tibble(
  word = map_chr(result$similarData[[1]]$values, "word"),
  metics = map_dbl(result$similarData[[1]]$values, "weight")
)
# PORTRAIT_MORPHEME ----------------------------------------------------
# Request the PORTRAIT_MORPHEME part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_MORPHEME")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per morpheme: its `value` and its `type`.
tibble(
  glossed = map_chr(result$morphemeData$morphemes, "value"),
  morph_type = map_chr(result$morphemeData$morphemes, "type")
)
# PORTRAIT_WORDFORMS ----------------------------------------------------
# Request the PORTRAIT_WORDFORMS part of the word portrait in the MAIN corpus
# and keep the parsed JSON in `result`.
# NOTE(review): this section queries the hard-coded lemma "стол" rather than
# `lemma_for_search` - presumably because the case/number table built below
# fits a noun; confirm.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = "стол",
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_WORDFORMS")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per entry of the word-form table: row label (case), column label
# (number), the form itself and its frequency information.
result$wordformsData$values |>
  map(\(cell) {
    tibble(
      case = cell$rowLabel$v,
      number = cell$columnLabel$v,
      form = cell$wfValue$value,
      ipm = cell$wfValue$freq$ipm,
      # category ranges from 1 to 3: 1 is the most frequent form, 3 the
      # least frequent
      category = cell$wfValue$freq$category
    )
  }) |>
  list_rbind()
# The pipeline has to end on the line above. With a trailing `|>`, R would
# continue the expression across the section header and pass this table to
# the following request() call as an extra first argument.

# PORTRAIT_COGNATES ----------------------------------------------------
# Request the PORTRAIT_COGNATES part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_COGNATES")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# does not work yet (original author's note; the response is not parsed)
# PORTRAIT_FIRST_MENTION ----------------------------------------------------
# Request the PORTRAIT_FIRST_MENTION part of the word portrait for
# `lemma_for_search` in the MAIN corpus and keep the parsed JSON in `result`.
result <- request("https://ruscorpora.ru/api/v1/word-portrait/") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      lemma = lemma_for_search,
      corpus = list(type = "MAIN"),
      resultType = list("PORTRAIT_FIRST_MENTION")
    )
  ) |>
  req_perform() |>
  resp_body_json()

# Metadata of the first item of the first-mention info: each parsing field
# becomes a (field, value) pair, the pairs are spread into one wide row, and
# the redirect lemma and redirect corpus type are appended as columns.
result$firstMentionData$info$items[[1]]$parsingFields |>
  map(\(parsed) {
    tibble(
      field = parsed$name,
      value = unlist(parsed$value)
    )
  }) |>
  list_rbind() |>
  pivot_wider(names_from = "field", values_from = "value") |>
  mutate(
    redirect_lemma = result$firstMentionData$redirectLemma,
    redirect_corpus = result$firstMentionData$redirectCorpus$type
  )
# search ------------------------------------------------------------------
# Lexico-grammatical search: one lemma combined with a grammatical-feature
# expression. Note that `lemma_for_search` is reassigned here.
lemma_for_search <- "печь"
grammatical_features <- "(S) & (gen | gen2 | dat) & (sg | pl)"

# The request body nests section -> subsection -> conditions; the two
# conditions constrain the `lex` and `gramm` fields of the same query.
result <- request("https://ruscorpora.ru/api/v1/lex-gramm/concordance") |>
  req_headers(
    Authorization = str_glue("Bearer {my_token}"),
    accept = "application/json",
    `Content-Type` = "application/json"
  ) |>
  req_body_json(
    list(
      corpus = list(type = "MAIN"),
      lexGramm = list(
        sectionValues = list(
          list(
            subsectionValues = list(
              list(
                conditionValues = list(
                  list(
                    fieldName = "lex",
                    text = list(v = lemma_for_search)
                  ),
                  list(
                    fieldName = "gramm",
                    text = list(v = grammatical_features)
                  )
                )
              )
            )
          )
        )
      )
    )
  ) |>
  req_perform() |>
  resp_body_json()

# One row per result group. Only the first document of each group is read,
# and within it only the first snippet of the first snippet group and the
# first word sequence of that snippet.
result$groups |>
  map(\(group) {
    first_doc <- group$docs[[1]]

    # glue the word tokens back together and normalise the whitespace
    text <- first_doc$snippetGroups[[1]]$snippets[[1]]$sequences[[1]]$words |>
      map_chr("text") |>
      str_c(collapse = "") |>
      str_squish()

    # document metadata: for every parsing field only its first value is
    # kept; the (type, value) pairs are then spread into one wide row
    first_doc$info$docExplainInfo$items[[1]]$parsingFields |>
      map(\(meta) {
        tibble(
          type = meta$name,
          value = meta$value[[1]]$valString$v
        )
      }) |>
      list_rbind() |>
      pivot_wider(names_from = "type", values_from = "value") |>
      mutate(
        title = first_doc$info$title,
        text = text,
        lemma = lemma_for_search,
        grammatical_features = grammatical_features
      ) |>
      # query columns first, then text and title, then the metadata
      relocate(lemma, grammatical_features, text, title)
  }) |>
  list_rbind()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment