Skip to content

Instantly share code, notes, and snippets.

View agricolamz's full-sized avatar

George Moroz agricolamz

View GitHub Profile
# Wrap the capital letters of a string in curly braces.
# `string` is passed straight to is.na() inside an `if` condition, so it is
# expected to be a single value; an NA input is returned unchanged.
# NOTE(review): str_remove_all()/str_replace_all() look like stringr functions;
# no library() call for them is visible in this snippet -- confirm it is loaded.
# NOTE(review): the closing braces of this function are not visible here.
embrase <- function(string){
if(is.na(string)){
string
} else {
string |>
# drop any curly braces that are already present in the input
str_remove_all("[\\{\\}]") |>
# zero-width match right after each listed capital letter; the replacement
# appears to put "}" at that position
# NOTE(review): confirm that the character between Е and Ж in the class below
# is Cyrillic Ё (U+0401) and not the look-alike Latin Ë (U+00CB).
str_replace_all("((?<=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
"\\}\\1") |>
# zero-width match right before each listed capital letter; the replacement
# appears to put "{" at that position
str_replace_all("((?=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
"\\1\\{")
library(ellmer)
# Chat object created with chat_ollama() for the "gemma3n" model.
chat <- chat_ollama(model = "gemma3n")
# Structured type describing one dictionary entry: a lemma, an optional
# morphology string (required = FALSE) and an array of meaning strings.
# The string passed to each type_*() call is the description text for that field.
# NOTE(review): the type_object() call is not closed within the visible lines.
# NOTE(review): the description texts contain what look like typos ("Begining",
# "a sing meaning") and call "1) 2) 3)" roman numerals -- confirm the wording
# is intended before relying on it as a prompt.
type_dictionary_entry <- type_object(
lemma = type_string("Begining of the string before parenthesis or list of meanings."),
morphology = type_string("Data provided in parenthesis", required = FALSE),
meanings = type_array(
# each element of the array is a plain string
type_string(),
"List of translation into Russian and examples. In most cases I expect just a sing meaning after parenthesis. If there are multiple meanings they will be listed using roman numerals like 1) 2) 3) or 1. 2. 3. The number of meanings should be exactly as much as there are items in the enumerated list. There are also examples of word usage that are listed after the colon. For each meaning there could be multiple examples",
)
# This code is licensed under the terms of the MIT license
# Author: George Moroz
# Date: 2025-08-14
# Update: 2025-09-16
# see the official docs: https://ruscorpora.github.io/public-api/
library(tidyverse)
# Lemma that will be searched for.
lemma_for_search <- "накачать"

# Access token; replace the placeholder with a real token before running.
my_token <- "put your token here"
library(tidyverse)
# Read the spreadsheet and keep only the rows that belong to the
# "Dagestan.xml" corpus and have a non-missing string_id.
df <- filter(
  readxl::read_xlsx("data.xlsx"),
  corpus == "Dagestan.xml" & !is.na(string_id)
)
# Trim the columns of `df`; the pipeline continues past the lines visible here.
df |>
# keep a column unless its NA count equals the number of rows in `df`,
# i.e. unless every value in it is missing
select(where(function(x) sum(is.na(x)) != nrow(df))) |>
# drop the person_id and corpus columns
select(-person_id, -corpus) |>
year meaning_ru frequency corpus
1800 вафля 0 ru-2019
1801 вафля 0 ru-2019
1802 вафля 0 ru-2019
1803 вафля 0 ru-2019
1804 вафля 0 ru-2019
1805 вафля 0 ru-2019
1806 вафля 0 ru-2019
1807 вафля 0 ru-2019
1808 вафля 0 ru-2019
library(tidyverse)
# Rows of the Coretta_2017_icelandic.csv table that belong to speaker "tt01".
vowels <- filter(
  read_csv("https://raw.githubusercontent.com/agricolamz/2024_HSE_b_da4l/master/data/Coretta_2017_icelandic.csv"),
  speaker == "tt01"
)

# Standard deviation and mean of the vowel.dur column for that speaker.
sd_data <- sd(vowels$vowel.dur)
mean_data <- mean(vowels$vowel.dur)
# Product of two normal densities evaluated at `x`.
#
# The first density uses `mean_data` and `sd_data`, which are free variables
# looked up outside the function when it is called. The second density uses
# `mean_prior` and `sd_prior`.
#
# x          numeric vector of points at which to evaluate the product
# mean_prior mean of the second normal density (default 87, the value that
#            was previously hard-coded)
# sd_prior   standard deviation of the second normal density (default 25,
#            the value that was previously hard-coded)
#
# Returns a numeric vector of the same length as `x`. Calling `m1(x)` with
# the defaults gives the same result as before the parameters were added.
m1 <- function(x, mean_prior = 87, sd_prior = 25) {
  dnorm(x, mean = mean_data, sd = sd_data) *
    dnorm(x, mean = mean_prior, sd = sd_prior)
}
library(tidyverse)
# Rows of the Coretta_2017_icelandic.csv table that belong to speaker "tt01".
vowels <- filter(
  read_csv("https://raw.githubusercontent.com/agricolamz/2025_HSE_b_da4l/refs/heads/main/data/Coretta_2017_icelandic.csv"),
  speaker == "tt01"
)

# Standard deviations: fixed prior value, value computed from the vowel.dur
# column, and their combination (inverse square root of the summed inverse
# variances).
sd_prior <- 25
sd_data <- sd(vowels$vowel.dur)
sd_post <- 1/sqrt(1/sd_prior^2 + 1/sd_data^2)

# Means: fixed prior value and the mean of the vowel.dur column.
mean_prior <- 87
mean_data <- mean(vowels$vowel.dur)
library(tidyverse)
library(lingtypology)
# Table returned by bivaltyp.feature().
df <- bivaltyp.feature()

# Export the rows whose family_WALS is "Nakh-Daghestanian" to an .xlsx file
# on the desktop.
writexl::write_xlsx(
  filter(df, family_WALS == "Nakh-Daghestanian"),
  "~/Desktop/daghestan_bivaltyp.xlsx"
)
df |>
library(tidyverse)
# Sum of (time_end - time_start) over the non-empty annotations of tier 2,
# divided by 60 (presumably seconds to minutes -- confirm the time unit).
phonfieldwork::read_from_folder("...") |>
  filter(tier == 2 & content != "") |>
  summarise(duration_minutes = sum(time_end - time_start) / 60)
library(tidyverse)
# Two annotated dialect corpus tables.
khi <- read_csv("russian_spoken_corpora_analysis/dialect_khislavichi_udpiped_mystemed.csv")
spi <- read_csv("russian_spoken_corpora_analysis/dialect_spiridonova_buda_udpiped_mystemed.csv")

# Stack both tables, keep rows with mystem_pos "S" whose mystem_feats matches
# both patterns below, and export the result to an .xlsx file on the desktop.
bind_rows(khi, spi) |>
  filter(
    mystem_pos == "S" &
      str_detect(mystem_feats, "nom.pl") &
      str_detect(mystem_feats, "^[mn],")
  ) |>
  writexl::write_xlsx("~/Desktop/4Sveta_N_mn_nompl.xlsx")