George Moroz agricolamz

## ragnar_experiment.R
library(tidyverse)
library(ragnar)

# creating a store --------------------------------------------------------
ragnar_store_create(
  location = "english_linguistic_abstract",
  overwrite = TRUE,
  version = 1,
  embed = function(x){ragnar::embed_ollama(x, model = "embeddinggemma")},
  extra_cols = data.frame(author = character(),

## embrase_with_curly_brackets.R
embrase <- function(string){
  if(is.na(string)){
    string
  } else {
    string |>
      str_remove_all("[\\{\\}]") |>
      str_replace_all("((?<=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
                      "\\}\\1") |>
      str_replace_all("((?=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
                      "\\1\\{")

## llm_splitting_dictionary_entry.R
library(ellmer)
chat <- chat_ollama(model = "gemma3n")

type_dictionary_entry <- type_object(
    lemma = type_string("Begining of the string before parenthesis or list of meanings."),
    morphology = type_string("Data provided in parenthesis", required = FALSE),
    meanings = type_array(
      type_string(),
      "List of translation into Russian and examples. In most cases I expect just a sing meaning after parenthesis. If there are multiple meanings they will be listed using roman numerals like 1) 2) 3) or 1. 2. 3. The number of meanings should be exactly as much as there are items in the enumerated list. There are also examples of word usage that are listed after the colon. For each meaning there could be multiple examples",
    )

## ruscorpora.ru_API.R
# This code is licensed under the terms of the MIT license
# Author: George Moroz
# Date: 2025-08-14
# Update: 2026-01-26
# see the official docs: https://ruscorpora.github.io/public-api/

my_token <- "put your token here"

# I'd suggest putting your token into the .Renviron file
# ruscorpora_api_token="put your token here"

## df_to_xml.R
library(tidyverse)

# Load "data.xlsx" and keep only the rows that belong to the "Dagestan.xml"
# corpus and have a non-missing `string_id`.
df <- filter(
  readxl::read_xlsx("data.xlsx"),
  corpus == "Dagestan.xml",
  !is.na(string_id)
)

df |>
  select(where(function(x) sum(is.na(x)) != nrow(df))) |>
  select(-person_id, -corpus) |>

## ngrams_sample.csv

          
            year
            meaning_ru
            frequency
            corpus

            
              1800
              вафля
              0
              ru-2019

            
              1801
              вафля
              0
              ru-2019

            
              1802
              вафля
              0
              ru-2019

            
              1803
              вафля
              0
              ru-2019

            
              1804
              вафля
              0
              ru-2019

            
              1805
              вафля
              0
              ru-2019

            
              1806
              вафля
              0
              ru-2019

            
              1807
              вафля
              0
              ru-2019

            
              1808
              вафля
              0
              ru-2019

## code_for_6_task.R
library(tidyverse)
# Rows for speaker "tt01" from the CSV hosted in the course repository.
vowels <- filter(
  read_csv("https://raw.githubusercontent.com/agricolamz/2024_HSE_b_da4l/master/data/Coretta_2017_icelandic.csv"),
  speaker == "tt01"
)

# Sample mean and standard deviation of the `vowel.dur` column.
mean_data <- mean(vowels$vowel.dur)
sd_data <- sd(vowels$vowel.dur)

# Product of two normal densities evaluated at `x`: one parameterised by the
# sample statistics computed above, the other with mean 87 and sd 25.
m1 <- function(x) {
  density_from_data <- dnorm(x, mean = mean_data, sd = sd_data)
  density_fixed <- dnorm(x, mean = 87, sd = 25)
  density_from_data * density_fixed
}

## da4l_class_credible_intervals.R
library(tidyverse)
# Rows for speaker "tt01" from the CSV hosted in the course repository.
vowels <- filter(
  read_csv("https://raw.githubusercontent.com/agricolamz/2025_HSE_b_da4l/refs/heads/main/data/Coretta_2017_icelandic.csv"),
  speaker == "tt01"
)

# Fixed prior parameters.
mean_prior <- 87
sd_prior <- 25

# Sample statistics of the `vowel.dur` column.
mean_data <- mean(vowels$vowel.dur)
sd_data <- sd(vowels$vowel.dur)

# Sum the two precisions (1 / sd^2), then convert the total back to a
# standard deviation.
sd_post <- 1 / sqrt(1 / sd_prior^2 + 1 / sd_data^2)

## msa_bivaltyp.R
library(tidyverse)
library(lingtypology)
# Table returned by `bivaltyp.feature()`.
df <- bivaltyp.feature()

# Write the rows whose `family_WALS` is "Nakh-Daghestanian" to an Excel file
# on the Desktop.
writexl::write_xlsx(
  filter(df, family_WALS == "Nakh-Daghestanian"),
  "~/Desktop/daghestan_bivaltyp.xlsx"
)

df |>

## calculate_duration_textgrids.R
library(tidyverse)
# Sum the length (time_end - time_start) of every non-empty annotation on
# tier 2 of the files read from the folder, and report the total divided by 60
# (minutes, assuming the time columns are in seconds -- TODO confirm).
phonfieldwork::read_from_folder("...") |>
  filter(tier == 2 & content != "") |>
  summarise(duration_minutes = sum(time_end - time_start) / 60)
	library(tidyverse)
	library(ragnar)

	# creating a store --------------------------------------------------------
	ragnar_store_create(
	location = "english_linguistic_abstract",
	overwrite = TRUE,
	version = 1,
	embed = function(x){ragnar::embed_ollama(x, model = "embeddinggemma")},
	extra_cols = data.frame(author = character(),
	embrase <- function(string){
	if(is.na(string)){
	string
	} else {
	string \|>
	str_remove_all("[\\{\\}]") \|>
	str_replace_all("((?<=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
	"\\}\\1") \|>
	str_replace_all("((?=[ABCDEFGHIJKLMNOPQRSTUVWXYZАБВГДЕËЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ]))",
	"\\1\\{")
	library(ellmer)
	chat <- chat_ollama(model = "gemma3n")

	type_dictionary_entry <- type_object(
	lemma = type_string("Begining of the string before parenthesis or list of meanings."),
	morphology = type_string("Data provided in parenthesis", required = FALSE),
	meanings = type_array(
	type_string(),
	"List of translation into Russian and examples. In most cases I expect just a sing meaning after parenthesis. If there are multiple meanings they will be listed using roman numerals like 1) 2) 3) or 1. 2. 3. The number of meanings should be exactly as much as there are items in the enumerated list. There are also examples of word usage that are listed after the colon. For each meaning there could be multiple examples",
	)
	# This code is licensed under the terms of the MIT license
	# Author: George Moroz
	# Date: 2025-08-14
	# Update: 2026-01-26
	# see the official docs: https://ruscorpora.github.io/public-api/

	my_token <- "put your token here"

	# I'd suggest putting your token into the .Renviron file
	# ruscorpora_api_token="put your token here"
	library(tidyverse)

	# Load "data.xlsx" and keep only the rows that belong to the "Dagestan.xml"
	# corpus and have a non-missing `string_id`.
	# The pipe was written as `\|>` (an escaped-markup artifact), which R cannot
	# parse; it is restored to the native pipe `|>` here.
	df <- readxl::read_xlsx("data.xlsx") |>
	  filter(
	    corpus == "Dagestan.xml",
	    !is.na(string_id)
	  )

	df \|>
	select(where(function(x) sum(is.na(x)) != nrow(df))) \|>
	select(-person_id, -corpus) \|>
year	meaning_ru	frequency	corpus
1800	вафля	0	ru-2019
1801	вафля	0	ru-2019
1802	вафля	0	ru-2019
1803	вафля	0	ru-2019
1804	вафля	0	ru-2019
1805	вафля	0	ru-2019
1806	вафля	0	ru-2019
1807	вафля	0	ru-2019
1808	вафля	0	ru-2019
	library(tidyverse)
	# Rows for speaker "tt01" from the CSV hosted in the course repository.
	# The pipe was written as `\|>` (an escaped-markup artifact), which R cannot
	# parse; it is restored to the native pipe `|>` here.
	vowels <- read_csv("https://raw.githubusercontent.com/agricolamz/2024_HSE_b_da4l/master/data/Coretta_2017_icelandic.csv") |>
	  filter(speaker == "tt01")

	# Sample mean and standard deviation of the `vowel.dur` column.
	mean_data <- mean(vowels$vowel.dur)
	sd_data <- sd(vowels$vowel.dur)

	# Product of two normal densities evaluated at `x`: one parameterised by the
	# sample statistics computed above, the other with mean 87 and sd 25.
	m1 <- function(x) {
	  dnorm(x, mean = mean_data, sd = sd_data) *
	    dnorm(x, mean = 87, sd = 25)
	}
	library(tidyverse)
	library(lingtypology)
	# Table returned by `bivaltyp.feature()`.
	df <- bivaltyp.feature()

	# Write the rows whose `family_WALS` is "Nakh-Daghestanian" to an Excel file
	# on the Desktop.
	# The pipes were written as `\|>` (an escaped-markup artifact), which R
	# cannot parse; they are restored to the native pipe `|>` here.
	df |>
	  filter(family_WALS == "Nakh-Daghestanian") |>
	  writexl::write_xlsx("~/Desktop/daghestan_bivaltyp.xlsx")

	df \|>
	library(tidyverse)
	# Sum the length (time_end - time_start) of every non-empty annotation on
	# tier 2 of the files read from the folder, and report the total divided by
	# 60 (minutes, assuming the time columns are in seconds -- TODO confirm).
	# The pipes were written as `\|>` (an escaped-markup artifact), which R
	# cannot parse; they are restored to the native pipe `|>` here.
	phonfieldwork::read_from_folder("...") |>
	  filter(
	    tier == 2,
	    content != ""
	  ) |>
	  mutate(dur = time_end - time_start) |>
	  summarise(duration_minutes = sum(dur) / 60)