ChakshuGautam/story_query.py

## story_query.py
"""
StoryQuery model with child safety scoring and dynamic field boosts.
Uses Google Gemini (gemini-2.5-flash-lite) for structured query extraction.

Dependencies:
    pip install google-genai pydantic
"""

import os
import json
from pydantic import BaseModel, Field
from typing import List
from google import genai
from google.genai import types

# Runtime configuration, read from the environment once at import time.
# GOOGLE_API_KEY is None when the variable is unset; no check is made here.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# Model id; overridable through GEMINI_MODEL, otherwise the flash-lite model.
MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite")

# Shared Gemini client used by translate_and_refine(); it is constructed as an
# import-time side effect of loading this module.
client = genai.Client(api_key=GOOGLE_API_KEY)


# Per-field boost weights for the story search query.
# Every field carries a default, so a partial (or empty) object still validates.
# The defaults are the same numbers quoted in PROMPT_TEMPLATE.
class FieldBoosts(BaseModel):
    # Title fields.
    title: float = 40.0
    english_title: float = 18.0
    # Thematic fields (the prompt raises these for thematic queries).
    synopsis: float = 5.0
    tags_name: float = 12.0
    hidden_tags: float = 12.0
    # People and language fields.
    illustrators: float = 15.0
    authors: float = 15.0
    language: float = 15.0


# Structured form of a user's story search request. translate_and_refine()
# also passes this model to Gemini as the response schema.
class StoryQuery(BaseModel):
    # Language name in English, e.g. "English" or "Hindi" (per PROMPT_TEMPLATE).
    language: str
    # Subject keywords extracted from the request.
    subject: List[str]
    # Author names mentioned in the request; the prompt asks for [] when none.
    authors: List[str]
    # Reading level; the prompt defines 1=beginner through 4=adult, default 2.
    level: int
    # No range constraint is declared here: the 0.0-1.0 range is stated only
    # in the description text and in the prompt.
    safety_score: float = Field(
        description="Child safety score: 0.0 (completely unsafe) to 1.0 (fully safe for children)"
    )
    # Search boost weights; FieldBoosts supplies the defaults.
    field_boosts: FieldBoosts


# Instruction prefix sent to the model. translate_and_refine() appends the raw
# user query directly after the final "Now process this input:" line.
# NOTE(review): rule 1 refers to "the schema below", but no schema appears in
# this text; the schema is supplied separately via response_schema.
PROMPT_TEMPLATE = """
You are a highly efficient and constrained Query Processor for a children's story search engine.
Your task is to transform a user's natural language request into a single, clean JSON object.

# INSTRUCTIONS AND CONSTRAINTS

1. **Strict Output Format:** Return only a valid JSON object matching the schema below.

2. **Child Safety Score (EVALUATE FIRST):**
   * Set `safety_score` as a float between 0.0 and 1.0:
     - 1.0   = completely safe and appropriate for children
     - 0.7–0.99 = minor concerns (mild cartoon violence, slightly scary themes)
     - 0.4–0.69 = moderate concern (themes unsuitable for young children but not explicit)
     - 0.0–0.39 = clearly unsafe (sexual content, graphic violence, drugs, weapons, hate speech)
   * The caller decides the block threshold — always provide a precise score.

3. **Schema and Field Rules:**
   * `language`: Full language name in English (e.g., "English", "Hindi"). Detect from input
     if not explicitly requested.
   * `subject`: Translate to English, strip filler phrases, decompose into 1–2 strong keywords.
   * `authors`: Extract specific author names. Use [] if none.
   * `level`: 1=beginner/small kids, 2=general (default), 3=young adult, 4=adult.
   * `field_boosts`: Adjust Elasticsearch field boost weights based on query intent.
     Defaults: title=40, english_title=18, synopsis=5, tags_name=12, hidden_tags=12,
     illustrators=15, authors=15, language=15.
     - Specific title mentioned       → raise `title` and `english_title`
     - Specific author/illustrator    → raise `authors` or `illustrators`
     - Thematic query                 → raise `synopsis`, `tags_name`, `hidden_tags`
     - Language explicitly requested  → raise `language`
     - Otherwise: keep defaults

Now process this input:
"""


def translate_and_refine(text: str) -> dict:
    """Extract structured search parameters from a natural language query.

    Sends ``PROMPT_TEMPLATE + text`` to the configured Gemini model with
    ``StoryQuery`` as the response schema, validates the JSON reply against
    that model and returns it as a plain dict.

    Args:
        text: The user's natural language search request.

    Returns:
        A dict with the ``StoryQuery`` fields: ``language``, ``subject``,
        ``authors``, ``level``, ``safety_score`` and ``field_boosts``.

    Raises:
        ValueError: If the model returned no text, or if ``safety_score`` is
            outside the documented 0.0-1.0 range.

    Errors raised by ``StoryQuery.model_validate_json`` for malformed replies
    propagate unchanged.
    """
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=PROMPT_TEMPLATE + text,
        config=types.GenerateContentConfig(
            max_output_tokens=300,
            temperature=0.7,
            response_mime_type="application/json",
            response_schema=StoryQuery,
        ),
    )

    # A blocked reply or one without a text part leaves nothing to parse;
    # report that directly instead of handing an empty value to the validator.
    raw = response.text
    if not raw:
        raise ValueError("Gemini returned no text for this query")

    parsed = StoryQuery.model_validate_json(raw)

    # Callers gate content on `safety_score >= threshold`. A score above 1.0
    # (e.g. one given on a 0-10 scale) would pass every such check, so an
    # out-of-range value is rejected rather than returned.
    if not 0.0 <= parsed.safety_score <= 1.0:
        raise ValueError(
            f"Gemini returned an out-of-range safety_score: {parsed.safety_score!r}"
        )
    return parsed.model_dump()


if __name__ == "__main__":
    # Demo run: push a few sample requests through translate_and_refine() and
    # print each one with a traffic-light marker for its safety score.
    sample_queries = [
        "stories about friendship and sharing for kids",
        "show me a Hindi story about a brave little girl who saves her village",
        "story where the hero fights a scary monster",
        "a story about a drug dealer",
        "story with graphic murder and gore",
    ]

    for user_query in sample_queries:
        extracted = translate_and_refine(user_query)
        safety = extracted["safety_score"]

        # Green at 0.7 and above, yellow from 0.4, red below that.
        if safety >= 0.7:
            marker = "🟢"
        elif safety >= 0.4:
            marker = "🟡"
        else:
            marker = "🔴"

        print(f"{marker} safety={safety:.2f} | subject={extracted['subject']}")
        print(f"   query : {user_query}")
        print(f"   boosts: {extracted['field_boosts']}")
        print()

## story_query.rb
# StoryQuery model with child safety scoring and dynamic field boosts.
# Uses Google Gemini (gemini-2.5-flash-lite) for structured query extraction.
#
# Dependencies:
#   gem install faraday json

require "faraday"
require "json"

# Runtime configuration. ENV.fetch without a default raises KeyError when
# GOOGLE_API_KEY is unset, so the script stops at load time without a key.
GOOGLE_API_KEY = ENV.fetch("GOOGLE_API_KEY")
# Model id; overridable through GEMINI_MODEL.
MODEL_NAME     = ENV.fetch("GEMINI_MODEL", "gemini-2.5-flash-lite")
# generateContent endpoint for the selected model.
GEMINI_URL     = "https://generativelanguage.googleapis.com/v1beta/models/#{MODEL_NAME}:generateContent"

# Default boost weight per search field. translate_and_refine merges the
# model's "field_boosts" over this hash, so any key the reply omits keeps the
# value listed here. Keys are strings to match the parsed JSON reply.
FIELD_BOOST_DEFAULTS = {
  "title"         => 40.0,
  "english_title" => 18.0,
  "synopsis"      =>  5.0,
  "tags_name"     => 12.0,
  "hidden_tags"   => 12.0,
  "illustrators"  => 15.0,
  "authors"       => 15.0,
  "language"      => 15.0
}.freeze

# Schema sent as generationConfig.responseSchema by translate_and_refine.
# All six top-level fields and all eight boost fields are marked required.
# Note: freeze is shallow, so the nested hashes remain mutable.
RESPONSE_SCHEMA = {
  type: "OBJECT",
  properties: {
    language:      { type: "STRING" },
    subject:       { type: "ARRAY", items: { type: "STRING" } },
    authors:       { type: "ARRAY", items: { type: "STRING" } },
    level:         { type: "INTEGER" },
    # The 0.0-1.0 range is stated only in the description text; the schema
    # itself declares no numeric bounds.
    safety_score: {
      type: "NUMBER",
      description: "Child safety score: 0.0 (completely unsafe) to 1.0 (fully safe for children)"
    },
    # Same field names as FIELD_BOOST_DEFAULTS.
    field_boosts: {
      type: "OBJECT",
      properties: {
        title:         { type: "NUMBER" },
        english_title: { type: "NUMBER" },
        synopsis:      { type: "NUMBER" },
        tags_name:     { type: "NUMBER" },
        hidden_tags:   { type: "NUMBER" },
        illustrators:  { type: "NUMBER" },
        authors:       { type: "NUMBER" },
        language:      { type: "NUMBER" }
      },
      required: %w[title english_title synopsis tags_name hidden_tags illustrators authors language]
    }
  },
  required: %w[language subject authors level safety_score field_boosts]
}.freeze

# Instruction prefix sent to the model. translate_and_refine appends the raw
# user query directly after the final "Now process this input:" line.
# The squiggly heredoc strips the common two-space indentation.
# NOTE(review): rule 1 refers to "the schema below", but no schema appears in
# this text; the schema is supplied separately via responseSchema.
PROMPT_TEMPLATE = <<~PROMPT
  You are a highly efficient and constrained Query Processor for a children's story search engine.
  Your task is to transform a user's natural language request into a single, clean JSON object.

  # INSTRUCTIONS AND CONSTRAINTS

  1. **Strict Output Format:** Return only a valid JSON object matching the schema below.

  2. **Child Safety Score (EVALUATE FIRST):**
     * Set `safety_score` as a float between 0.0 and 1.0:
       - 1.0      = completely safe and appropriate for children
       - 0.7–0.99 = minor concerns (mild cartoon violence, slightly scary themes)
       - 0.4–0.69 = moderate concern (themes unsuitable for young children but not explicit)
       - 0.0–0.39 = clearly unsafe (sexual content, graphic violence, drugs, weapons, hate speech)
     * The caller decides the block threshold — always provide a precise score.

  3. **Schema and Field Rules:**
     * `language`: Full language name in English (e.g., "English", "Hindi"). Detect from input
       if not explicitly requested.
     * `subject`: Translate to English, strip filler phrases, decompose into 1–2 strong keywords.
     * `authors`: Extract specific author names. Use [] if none.
     * `level`: 1=beginner/small kids, 2=general (default), 3=young adult, 4=adult.
     * `field_boosts`: Adjust Elasticsearch field boost weights based on query intent.
       Defaults: title=40, english_title=18, synopsis=5, tags_name=12, hidden_tags=12,
       illustrators=15, authors=15, language=15.
       - Specific title mentioned       → raise `title` and `english_title`
       - Specific author/illustrator    → raise `authors` or `illustrators`
       - Thematic query                 → raise `synopsis`, `tags_name`, `hidden_tags`
       - Language explicitly requested  → raise `language`
       - Otherwise: keep defaults

  Now process this input:
PROMPT

# Extracts structured search parameters from a natural language query.
#
# Posts PROMPT_TEMPLATE + text to the Gemini generateContent endpoint with
# RESPONSE_SCHEMA as the response schema and parses the JSON reply.
#
# @param text [String] the user's natural language search request
# @return [Hash] the parsed reply with string keys; "field_boosts" always
#   contains every key of FIELD_BOOST_DEFAULTS
# @raise [RuntimeError] on an unsuccessful HTTP response, when the reply has
#   no text part, or when safety_score is not a number within 0.0..1.0
# @raise [JSON::ParserError] when the reply text is not valid JSON
def translate_and_refine(text)
  conn = Faraday.new(url: GEMINI_URL) do |f|
    f.request  :json
    f.response :json
    f.adapter  Faraday.default_adapter
  end

  body = {
    contents: [{ parts: [{ text: PROMPT_TEMPLATE + text }] }],
    generationConfig: {
      maxOutputTokens:  300,
      temperature:      0.7,
      responseMimeType: "application/json",
      responseSchema:   RESPONSE_SCHEMA
    }
  }

  # Authenticate with the x-goog-api-key header instead of a "?key=" query
  # string so the key never becomes part of the request URL (and of anything
  # that records URLs). Passing nil posts to the connection's own URL.
  response = conn.post(nil, body, "x-goog-api-key" => GOOGLE_API_KEY)
  raise "Gemini error: #{response.body}" unless response.success?

  # A blocked reply, or one without candidates/parts, makes dig return nil;
  # fail with the response body rather than a TypeError from JSON.parse.
  raw = response.body.dig("candidates", 0, "content", "parts", 0, "text")
  raise "Gemini returned no text: #{response.body}" if raw.nil? || raw.empty?

  result = JSON.parse(raw)

  # Callers gate content on `safety_score >= threshold`; a score above 1.0
  # would pass every such check, so out-of-range values are rejected.
  score = result["safety_score"]
  unless score.is_a?(Numeric) && score.between?(0.0, 1.0)
    raise "Gemini returned an out-of-range safety_score: #{score.inspect}"
  end

  # Fill any missing boost keys with defaults
  result["field_boosts"] = FIELD_BOOST_DEFAULTS.merge(result["field_boosts"] || {})
  result
end

if __FILE__ == $0
  # Demo run: push a few sample requests through translate_and_refine and
  # print each one with a traffic-light marker for its safety score.
  sample_queries = [
    "stories about friendship and sharing for kids",
    "show me a Hindi story about a brave little girl who saves her village",
    "story where the hero fights a scary monster",
    "a story about a drug dealer",
    "story with graphic murder and gore"
  ]

  sample_queries.each do |user_query|
    extracted = translate_and_refine(user_query)
    safety    = extracted["safety_score"].to_f

    # Green at 0.7 and above, yellow from 0.4, red below that.
    marker =
      if safety >= 0.7
        "🟢"
      elsif safety >= 0.4
        "🟡"
      else
        "🔴"
      end

    puts "#{marker} safety=#{format('%.2f', safety)} | subject=#{extracted['subject']}"
    puts "   query : #{user_query}"
    puts "   boosts: #{extracted['field_boosts']}"
    puts
  end
end
	"""
	StoryQuery model with child safety scoring and dynamic field boosts.
	Uses Google Gemini (gemini-2.5-flash-lite) for structured query extraction.

	Dependencies:
	pip install google-genai pydantic
	"""

	import os
	import json
	from pydantic import BaseModel, Field
	from typing import List
	from google import genai
	from google.genai import types

	# Runtime configuration, read from the environment once at import time.
	# GOOGLE_API_KEY is None when the variable is unset; no check is made here.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
	# Model id; overridable through GEMINI_MODEL, otherwise the flash-lite model.
	MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-flash-lite")

	# Shared Gemini client, constructed as an import-time side effect.
	client = genai.Client(api_key=GOOGLE_API_KEY)


	# Per-field boost weights for the story search query.
	# Every field carries a default, so a partial (or empty) object still
	# validates. The class body is re-indented here; without indentation the
	# field lines are not part of the class and the code does not parse.
	class FieldBoosts(BaseModel):
	    # Title fields.
	    title: float = 40.0
	    english_title: float = 18.0
	    # Thematic fields.
	    synopsis: float = 5.0
	    tags_name: float = 12.0
	    hidden_tags: float = 12.0
	    # People and language fields.
	    illustrators: float = 15.0
	    authors: float = 15.0
	    language: float = 15.0


	# Structured form of a user's story search request. The class body is
	# re-indented here; without indentation the code does not parse.
	class StoryQuery(BaseModel):
	    # Language name in English, e.g. "English" or "Hindi".
	    language: str
	    # Subject keywords extracted from the request.
	    subject: List[str]
	    # Author names mentioned in the request; empty when none.
	    authors: List[str]
	    # Reading level; the prompt defines 1=beginner through 4=adult.
	    level: int
	    # The 0.0-1.0 range is stated only in the description text.
	    safety_score: float = Field(
	        description="Child safety score: 0.0 (completely unsafe) to 1.0 (fully safe for children)"
	    )
	    # Search boost weights; FieldBoosts supplies the defaults.
	    field_boosts: FieldBoosts


	# Instruction prefix sent to the model; translate_and_refine() appends the
	# raw user query after the final "Now process this input:" line.
	# NOTE(review): unlike the PROMPT_TEMPLATE earlier in this file, this copy
	# has no ** emphasis markers or list indentation, and every line starts
	# with a tab that is part of the string - confirm which wording is intended.
	PROMPT_TEMPLATE = """
	You are a highly efficient and constrained Query Processor for a children's story search engine.
	Your task is to transform a user's natural language request into a single, clean JSON object.

	# INSTRUCTIONS AND CONSTRAINTS

	1. Strict Output Format: Return only a valid JSON object matching the schema below.

	2. Child Safety Score (EVALUATE FIRST):
	* Set `safety_score` as a float between 0.0 and 1.0:
	- 1.0 = completely safe and appropriate for children
	- 0.7–0.99 = minor concerns (mild cartoon violence, slightly scary themes)
	- 0.4–0.69 = moderate concern (themes unsuitable for young children but not explicit)
	- 0.0–0.39 = clearly unsafe (sexual content, graphic violence, drugs, weapons, hate speech)
	* The caller decides the block threshold — always provide a precise score.

	3. Schema and Field Rules:
	* `language`: Full language name in English (e.g., "English", "Hindi"). Detect from input
	if not explicitly requested.
	* `subject`: Translate to English, strip filler phrases, decompose into 1–2 strong keywords.
	* `authors`: Extract specific author names. Use [] if none.
	* `level`: 1=beginner/small kids, 2=general (default), 3=young adult, 4=adult.
	* `field_boosts`: Adjust Elasticsearch field boost weights based on query intent.
	Defaults: title=40, english_title=18, synopsis=5, tags_name=12, hidden_tags=12,
	illustrators=15, authors=15, language=15.
	- Specific title mentioned → raise `title` and `english_title`
	- Specific author/illustrator → raise `authors` or `illustrators`
	- Thematic query → raise `synopsis`, `tags_name`, `hidden_tags`
	- Language explicitly requested → raise `language`
	- Otherwise: keep defaults

	Now process this input:
	"""


	def translate_and_refine(text: str) -> dict:
	    """Extract structured search parameters from a natural language query.

	    Sends ``PROMPT_TEMPLATE + text`` to the configured Gemini model with
	    ``StoryQuery`` as the response schema, validates the JSON reply
	    against that model and returns it as a plain dict.

	    Args:
	        text: The user's natural language search request.

	    Returns:
	        A dict with the ``StoryQuery`` fields.

	    Raises:
	        ValueError: If the model returned no text, or if ``safety_score``
	            is outside the documented 0.0-1.0 range.
	    """
	    response = client.models.generate_content(
	        model=MODEL_NAME,
	        contents=PROMPT_TEMPLATE + text,
	        config=types.GenerateContentConfig(
	            max_output_tokens=300,
	            temperature=0.7,
	            response_mime_type="application/json",
	            response_schema=StoryQuery,
	        ),
	    )

	    # A blocked reply or one without a text part leaves nothing to parse.
	    raw = response.text
	    if not raw:
	        raise ValueError("Gemini returned no text for this query")

	    parsed = StoryQuery.model_validate_json(raw)

	    # A score above 1.0 would pass any `>= threshold` gate, so reject
	    # out-of-range values instead of returning them.
	    if not 0.0 <= parsed.safety_score <= 1.0:
	        raise ValueError(
	            f"Gemini returned an out-of-range safety_score: {parsed.safety_score!r}"
	        )
	    return parsed.model_dump()


	if __name__ == "__main__":
	    # Demo run: push a few sample requests through translate_and_refine()
	    # and print each with a traffic-light marker for its safety score.
	    queries = [
	        "stories about friendship and sharing for kids",
	        "show me a Hindi story about a brave little girl who saves her village",
	        "story where the hero fights a scary monster",
	        "a story about a drug dealer",
	        "story with graphic murder and gore",
	    ]

	    for query in queries:
	        result = translate_and_refine(query)
	        score = result["safety_score"]
	        # Green at 0.7 and above, yellow from 0.4, red below that.
	        icon = "🟢" if score >= 0.7 else ("🟡" if score >= 0.4 else "🔴")
	        # The separator is a plain "|"; the "\|" form printed a stray
	        # backslash. The three-space padding on the detail lines matches
	        # the story_query.py listing earlier in this file.
	        print(f"{icon} safety={score:.2f} | subject={result['subject']}")
	        print(f"   query : {query}")
	        print(f"   boosts: {result['field_boosts']}")
	        print()
	# StoryQuery model with child safety scoring and dynamic field boosts.
	# Uses Google Gemini (gemini-2.5-flash-lite) for structured query extraction.
	#
	# Dependencies:
	# gem install faraday json

	require "faraday"
	require "json"

	# Runtime configuration. ENV.fetch without a default raises KeyError when
	# GOOGLE_API_KEY is unset, so the script stops at load time without a key.
	GOOGLE_API_KEY = ENV.fetch("GOOGLE_API_KEY")
	# Model id; overridable through GEMINI_MODEL.
	MODEL_NAME = ENV.fetch("GEMINI_MODEL", "gemini-2.5-flash-lite")
	# generateContent endpoint for the selected model.
	GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/#{MODEL_NAME}:generateContent"

	# Default boost weight per search field. translate_and_refine merges the
	# model's "field_boosts" over this hash, so any key the reply omits keeps
	# the value listed here.
	FIELD_BOOST_DEFAULTS = {
	"title" => 40.0,
	"english_title" => 18.0,
	"synopsis" => 5.0,
	"tags_name" => 12.0,
	"hidden_tags" => 12.0,
	"illustrators" => 15.0,
	"authors" => 15.0,
	"language" => 15.0
	}.freeze

	# Schema sent as generationConfig.responseSchema by translate_and_refine.
	# All six top-level fields and all eight boost fields are marked required.
	# Note: freeze is shallow, so the nested hashes remain mutable.
	RESPONSE_SCHEMA = {
	type: "OBJECT",
	properties: {
	language: { type: "STRING" },
	subject: { type: "ARRAY", items: { type: "STRING" } },
	authors: { type: "ARRAY", items: { type: "STRING" } },
	level: { type: "INTEGER" },
	# The 0.0-1.0 range is stated only in the description text.
	safety_score: {
	type: "NUMBER",
	description: "Child safety score: 0.0 (completely unsafe) to 1.0 (fully safe for children)"
	},
	# Same field names as FIELD_BOOST_DEFAULTS.
	field_boosts: {
	type: "OBJECT",
	properties: {
	title: { type: "NUMBER" },
	english_title: { type: "NUMBER" },
	synopsis: { type: "NUMBER" },
	tags_name: { type: "NUMBER" },
	hidden_tags: { type: "NUMBER" },
	illustrators: { type: "NUMBER" },
	authors: { type: "NUMBER" },
	language: { type: "NUMBER" }
	},
	required: %w[title english_title synopsis tags_name hidden_tags illustrators authors language]
	}
	},
	required: %w[language subject authors level safety_score field_boosts]
	}.freeze

	# Instruction prefix sent to the model; translate_and_refine appends the
	# raw user query after the final "Now process this input:" line.
	# NOTE(review): unlike the PROMPT_TEMPLATE in the story_query.rb listing
	# earlier in this file, this copy has no ** emphasis markers or list
	# indentation - confirm which wording is intended.
	PROMPT_TEMPLATE = <<~PROMPT
	You are a highly efficient and constrained Query Processor for a children's story search engine.
	Your task is to transform a user's natural language request into a single, clean JSON object.

	# INSTRUCTIONS AND CONSTRAINTS

	1. Strict Output Format: Return only a valid JSON object matching the schema below.

	2. Child Safety Score (EVALUATE FIRST):
	* Set `safety_score` as a float between 0.0 and 1.0:
	- 1.0 = completely safe and appropriate for children
	- 0.7–0.99 = minor concerns (mild cartoon violence, slightly scary themes)
	- 0.4–0.69 = moderate concern (themes unsuitable for young children but not explicit)
	- 0.0–0.39 = clearly unsafe (sexual content, graphic violence, drugs, weapons, hate speech)
	* The caller decides the block threshold — always provide a precise score.

	3. Schema and Field Rules:
	* `language`: Full language name in English (e.g., "English", "Hindi"). Detect from input
	if not explicitly requested.
	* `subject`: Translate to English, strip filler phrases, decompose into 1–2 strong keywords.
	* `authors`: Extract specific author names. Use [] if none.
	* `level`: 1=beginner/small kids, 2=general (default), 3=young adult, 4=adult.
	* `field_boosts`: Adjust Elasticsearch field boost weights based on query intent.
	Defaults: title=40, english_title=18, synopsis=5, tags_name=12, hidden_tags=12,
	illustrators=15, authors=15, language=15.
	- Specific title mentioned → raise `title` and `english_title`
	- Specific author/illustrator → raise `authors` or `illustrators`
	- Thematic query → raise `synopsis`, `tags_name`, `hidden_tags`
	- Language explicitly requested → raise `language`
	- Otherwise: keep defaults

	Now process this input:
	PROMPT

	# Extracts structured search parameters from a natural language query.
	#
	# Posts PROMPT_TEMPLATE + text to the Gemini generateContent endpoint with
	# RESPONSE_SCHEMA as the response schema and parses the JSON reply.
	#
	# @param text [String] the user's natural language search request
	# @return [Hash] the parsed reply with string keys; "field_boosts" always
	#   contains every key of FIELD_BOOST_DEFAULTS
	# @raise [RuntimeError] on an unsuccessful HTTP response, when the reply
	#   has no text part, or when safety_score is not within 0.0..1.0
	# @raise [JSON::ParserError] when the reply text is not valid JSON
	def translate_and_refine(text)
	  # Block parameters use plain pipes; the "\|f\|" form does not parse.
	  conn = Faraday.new(url: GEMINI_URL) do |f|
	    f.request :json
	    f.response :json
	    f.adapter Faraday.default_adapter
	  end

	  body = {
	    contents: [{ parts: [{ text: PROMPT_TEMPLATE + text }] }],
	    generationConfig: {
	      maxOutputTokens: 300,
	      temperature: 0.7,
	      responseMimeType: "application/json",
	      responseSchema: RESPONSE_SCHEMA
	    }
	  }

	  # Authenticate with the x-goog-api-key header instead of a "?key="
	  # query string so the key never becomes part of the request URL.
	  response = conn.post(nil, body, "x-goog-api-key" => GOOGLE_API_KEY)
	  raise "Gemini error: #{response.body}" unless response.success?

	  # A reply without candidates/parts makes dig return nil; fail with the
	  # response body rather than a TypeError from JSON.parse.
	  raw = response.body.dig("candidates", 0, "content", "parts", 0, "text")
	  raise "Gemini returned no text: #{response.body}" if raw.nil? || raw.empty?

	  result = JSON.parse(raw)

	  # A score above 1.0 would pass any `>= threshold` gate, so reject
	  # out-of-range values instead of returning them.
	  score = result["safety_score"]
	  unless score.is_a?(Numeric) && score.between?(0.0, 1.0)
	    raise "Gemini returned an out-of-range safety_score: #{score.inspect}"
	  end

	  # Fill any missing boost keys with defaults
	  result["field_boosts"] = FIELD_BOOST_DEFAULTS.merge(result["field_boosts"] || {})
	  result
	end

	if __FILE__ == $0
	  # Demo run: push a few sample requests through translate_and_refine and
	  # print each with a traffic-light marker for its safety score.
	  queries = [
	    "stories about friendship and sharing for kids",
	    "show me a Hindi story about a brave little girl who saves her village",
	    "story where the hero fights a scary monster",
	    "a story about a drug dealer",
	    "story with graphic murder and gore"
	  ]

	  # Block parameters use plain pipes; the "\|query\|" form does not parse.
	  queries.each do |query|
	    result = translate_and_refine(query)
	    score = result["safety_score"].to_f
	    # Green at 0.7 and above, yellow from 0.4, red below that.
	    icon = score >= 0.7 ? "🟢" : (score >= 0.4 ? "🟡" : "🔴")

	    # The separator is a plain "|", and the three-space padding on the
	    # detail lines matches the story_query.rb listing earlier in this file.
	    puts "#{icon} safety=#{format('%.2f', score)} | subject=#{result['subject']}"
	    puts "   query : #{query}"
	    puts "   boosts: #{result['field_boosts']}"
	    puts
	  end
	end