Last active
May 13, 2025 12:51
-
-
Save MikyPo/c7700c1f96a848b163c50dd3d3fda814 to your computer and use it in GitHub Desktop.
sql_session_counter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| '''Подсчёт сессий пользователей на сайте''' | |
| # Импорт библиотек | |
| import pandas as pd | |
| import numpy as np | |
| import random | |
| import sqlite3 | |
| from datetime import datetime, timedelta | |
| '''Генерация данных''' | |
| # Настройки генерации | |
| num_users = 10 # Количество уникальных пользователей | |
| events_per_user = 20 # Среднее количество событий на пользователя | |
| start_date = datetime(2024, 1, 1) | |
| end_date = datetime(2024, 12, 31) | |
| # Генерация | |
| data = [] | |
| for user_id in range(1, num_users + 1): | |
| # Случайное количество событий для пользователя (от 10 до 100) | |
| num_events = random.randint(10, 100) | |
| # Базовое время первого события пользователя | |
| first_event_time = start_date + timedelta( | |
| days=random.randint(0, (end_date - start_date).days), | |
| hours=random.randint(0, 23), | |
| minutes=random.randint(0, 59) | |
| ) | |
| # Генерация сессий | |
| current_time = first_event_time | |
| for _ in range(num_events): | |
| # Добавляем событие | |
| data.append({ | |
| 'user_id': user_id, | |
| 'dt': current_time | |
| }) | |
| # Следующее событие - либо в той же сессии, либо новая сессия | |
| if random.random() < 0.8: # 80% вероятность продолжения сессии | |
| delta = timedelta(seconds=random.randint(1, 300)) # 1-300 секунд между событиями | |
| else: | |
| delta = timedelta(hours=random.randint(1, 24)) # Новая сессия через 1-24 часа | |
| current_time += delta | |
| # Не выходим за границы периода | |
| if current_time > end_date: | |
| break | |
| # Создаем DataFrame | |
| df = pd.DataFrame(data) | |
| # Сортируем по времени | |
| df = df.sort_values('dt').reset_index(drop=True) | |
| # Смотрим что получилось | |
| df.info() | |
| display(df) | |
| '''Заливаем данные в БД''' | |
| # Подключаемся к SQLite (файл создастся автоматически) | |
| conn = sqlite3.connect('my_database.db') # путь к файлу | |
| # Сохраняем DataFrame в таблицу 'data' | |
| df.to_sql('client_log', conn, index=False, if_exists='replace') | |
| # Не забываем закрыть соединение | |
| conn.close() | |
| '''Запрос в БД''' | |
| # Подключаемся к базе | |
| conn = sqlite3.connect('my_database.db') | |
| # SQL-запрос | |
| query = """ | |
| SELECT * | |
| FROM client_log AS t | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Сгенерированный датафрейм') | |
| display(result_df) | |
| '''Подсчёт сессий''' | |
| '''Считаем предыдущие активности для каждой строки''' | |
| # Сутки | |
| day = 24*60*60 | |
| # SQL-запрос | |
| query = f""" | |
| SELECT | |
| t.*, | |
| LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt, | |
| ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) AS dt_diff | |
| FROM | |
| client_log AS t | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Разница в секундах текущего дествия с предыдущем') | |
| display(result_df) | |
| '''Проставляем флаг сессий (одна сессия не больше часа)''' | |
| # Одна сессия 1 ч = 60 мин. * 60 сек. | |
| one_session = 60*60 | |
| # SQL-запрос | |
| query = f""" | |
| WITH new_session AS ( | |
| SELECT | |
| t.*, | |
| LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt, | |
| ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) AS dt_diff, | |
| CASE WHEN ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) >= {one_session} | |
| THEN 1 ELSE 0 END AS new_session | |
| FROM | |
| client_log AS t) | |
| SELECT t.*, | |
| SUM(t.new_session) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS session_id | |
| FROM new_session AS t | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Разметка ID для сессий') | |
| display(result_df.loc[result_df['user_id'] == 3]) | |
| '''Сколько действий в сессии''' | |
| # SQL-запрос | |
| query = f""" | |
| WITH new_session AS ( | |
| SELECT | |
| t.*, | |
| LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt, | |
| ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) AS dt_diff, | |
| CASE WHEN ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) >= {one_session} | |
| THEN 1 ELSE 0 END AS new_session | |
| FROM | |
| client_log AS t), | |
| client_sessions AS ( | |
| SELECT t.*, | |
| SUM(t.new_session) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS session_id | |
| FROM new_session AS t) | |
| SELECT t.user_id, t.session_id, COUNT(1) AS action_count FROM client_sessions AS t | |
| GROUP BY t.user_id, t.session_id | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Количество действий у каждой сессии пользователя') | |
| display(result_df) | |
| '''Считаем количество сессий у каждого пользователя''' | |
| # SQL-запрос | |
| query = f""" | |
| WITH session_data AS ( | |
| WITH new_session AS ( | |
| SELECT | |
| t.*, | |
| LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt, | |
| ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) AS dt_diff, | |
| CASE | |
| WHEN ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) >= {one_session} | |
| THEN 1 | |
| ELSE 0 | |
| END AS new_session | |
| FROM | |
| client_log AS t | |
| ) | |
| SELECT | |
| user_id, | |
| SUM(new_session) OVER (PARTITION BY user_id ORDER BY dt) AS session_id | |
| FROM new_session | |
| ) | |
| SELECT | |
| user_id, | |
| COUNT(DISTINCT session_id) AS total_sessions | |
| FROM session_data | |
| GROUP BY user_id; | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Количество сессий у каждого пользователя') | |
| display(result_df) | |
| '''Считаем количество всех сессий у всех пользователей''' | |
| # SQL-запрос | |
| query = f""" | |
| WITH new_session AS ( | |
| SELECT | |
| t.*, | |
| LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS prev_dt, | |
| ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) AS dt_diff, | |
| CASE WHEN ROUND((JULIANDAY(t.dt) - JULIANDAY(LAG(t.dt) OVER (PARTITION BY t.user_id ORDER BY t.dt))) * {day}) >= {one_session} | |
| THEN 1 ELSE 0 END AS new_session | |
| FROM | |
| client_log AS t), | |
| client_sessions AS ( | |
| SELECT t.*, | |
| SUM(t.new_session) OVER (PARTITION BY t.user_id ORDER BY t.dt) AS session_id | |
| FROM new_session AS t), | |
| client_sessions_agg AS ( | |
| SELECT t.user_id, t.session_id, COUNT(1) AS action_count FROM client_sessions AS t | |
| GROUP BY t.user_id, t.session_id) | |
| SELECT COUNT(t.session_id) AS total_sessions FROM client_sessions_agg AS t | |
| """ | |
| result_df = pd.read_sql(query, conn) | |
| print('Всего количество сессий у всех пользователей') | |
| display(result_df) | |
| # Закрытие соединения | |
| conn.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment