## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE ) ## ----setup, warning = FALSE, message = FALSE---------------------------------- library(baserater) library(tidyverse) library(knitr) ## ----------------------------------------------------------------------------- # Load the base-rate database database <- download_data("database") # Load the typicality validation ratings ratings <- download_data("validation_ratings") # Load the typicality matrices gpt4_matrix <- download_data("typicality_matrix_gpt4") llama3_3_matrix <- download_data("typicality_matrix_llama3.3") # Load the group-adjective pairs material <- download_data("material") ## ----message=FALSE, warning = FALSE, echo = FALSE----------------------------- # Load pre-generated scores new_scores <- readRDS(system.file("extdata", "new_typicality_scores_llama3.1_8B.rds", package = "baserater")) ## ----eval = FALSE------------------------------------------------------------- # # Original prompt from the paper # original_system_prompt_content <- "You are expert at accurately reproducing the stereotypical associations humans make, in order to annotate data for experiments. Your focus is to capture common societal perceptions and stereotypes, rather than factual attributes of the groups, even when they are negative or unfounded." # # original_user_prompt_content_template <- "Rate how well the adjective '{description}' reflects the prototypical member of the group '{group}' on a scale from 0 ('Not at all') to 100 ('Extremely'). # # To clarify, consider the following examples: # # 1. 'Rate how well the adjective FUNNY reflects the prototypical member of the group CLOWN on a scale from 0 (Not at all) to 100 (Extremely).' A high rating is expected because the adjective 'FUNNY' closely aligns with the typical characteristics of a 'CLOWN'. # # 2. 'Rate how well the adjective FEARFUL reflects the prototypical member of the group FIREFIGHTER on a scale from 0 (Not at all) to 100 (Extremely).' A low rating is expected because the adjective 'FEARFUL' diverges significantly from the typical characteristics of a 'FIREFIGHTER'. # # 3. 'Rate how well the adjective PATIENT reflects the prototypical member of the group ENGINEER on a scale from 0 (Not at all) to 100 (Extremely).' A mid-scale rating is expected because the adjective 'PATIENT' neither closely aligns nor diverges significantly from the typical characteristics of an 'ENGINEER'. # # Your response should be a single score between 0 and 100, with no additional text, letters, or symbols included." # # # # Example using the validation ratings # groups <- ratings$group # descriptions <- ratings$adjective # # api_token <- Sys.getenv("PROVIDER_API_TOKEN") # # new_scores <- generate_typicality( # groups = groups, # descriptions = descriptions, # api_url = "https://api.together.xyz/v1/chat/completions", # example for 'Together AI' API # api_token = api_token, # model = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", # model name on Together AI # n = 3, # number of responses to generate # min_valid = 2, # minimum number of valid responses; mean of valid ones is used # max_tokens = 3, # numeric output between 0 and 100 # retries = 2, # number of retries in case of API errors # matrix = FALSE, # return_raw_scores = TRUE, # return_full_responses = TRUE, # verbose = TRUE) ## ----warning = FALSE---------------------------------------------------------- knitr::kable(head(new_scores)) ## ----------------------------------------------------------------------------- # Distribution of new typicality ratings ggplot(new_scores, aes(x = mean_score)) + geom_histogram(binwidth = 5, fill = "steelblue", color = "white") + labs( title = "Distribution of Typicality Ratings from 'LLaMA 3.1-8B-Instruct'", x = "Typicality Rating", y = "Count" ) + theme_classic() ## ----warning = FALSE---------------------------------------------------------- # Create a data frame with the same structure as the validation set new_scores = new_scores %>% mutate(adjective = description, rating = mean_score) %>% select(group, adjective, rating) knitr::kable(head(new_scores)) ## ----------------------------------------------------------------------------- # Join human and model scores comparison_df <- left_join( ratings %>% select(group, adjective, human = mean_human_rating), new_scores, by = c("group", "adjective") ) # Scatterplot ggplot(comparison_df, aes(x = rating, y = human)) + geom_point(alpha = 0.6) + geom_smooth(method = "lm", se = FALSE, color = "darkred") + labs( title = "Scatterplot of 'LLaMA 3.1' and Human Typicality Ratings", y = "Average Human Rating", x = "Average 'LLaMA 3.1' Rating" ) + theme_classic() ## ----------------------------------------------------------------------------- # Print correlation summary with human ground truth and baselines knitr::kable(evaluate_external_ratings(new_scores)) # Optionally store the output in a variable results <- evaluate_external_ratings(new_scores) ## ----warning = FALSE---------------------------------------------------------- #' The typicality matrix from 'GPT-4' is a data frame with group–adjective pairs and their typicality scores gpt4_matrix <- download_data("typicality_matrix_gpt4") knitr::kable(head(gpt4_matrix)) ## ----------------------------------------------------------------------------- #' Extract base-rate items from the typicality matrix base_rate_items <- extract_base_rate_items(gpt4_matrix) ## ----warning = FALSE---------------------------------------------------------- # View top base-rate items by stereotype strength knitr::kable(base_rate_items %>% arrange(desc(StereotypeStrength)) %>% head(10)) ## ----------------------------------------------------------------------------- # Pick one adjective and extract group typicality scores df <- gpt4_matrix %>% select(group, selfish) %>% rename(score = selfish) %>% arrange(desc(score)) # sort by how typical the group is # Save group names and their scores group_order <- df$group typ_values <- df$score names(typ_values) <- df$group # Build all group pairs and compute log-ratios res_df <- expand.grid( g1 = group_order, g2 = group_order, KEEP.OUT.ATTRS = FALSE ) %>% mutate( typ1 = typ_values[as.character(g1)], typ2 = typ_values[as.character(g2)], log_ratio = log(pmax(typ1, 1e-9) / pmax(typ2, 1e-9)) ) %>% mutate( g1 = factor(g1, levels = group_order), g2 = factor(g2, levels = group_order) ) # Keep only pairs where g1 is ranked higher than g2 and log-ratio is positive res_df <- res_df %>% filter(as.integer(g1) < as.integer(g2), log_ratio > 0) # Add identity pairs (g1 == g2) with NA diag_df <- tibble( g1 = factor(group_order, levels = group_order), g2 = factor(group_order, levels = group_order), log_ratio = NA_real_ ) # Combine with original filtered upper-triangle pairs res_df <- bind_rows(res_df, diag_df) # Plot the heatmap ggplot(res_df, aes(x = g2, y = g1, fill = log_ratio)) + geom_tile(color = "white") + scale_fill_gradient2( low = "steelblue", mid = "white", high = "firebrick", midpoint = 0, na.value = "grey90", name = "Log(Group 1 / Group 2)", ) + labs( title = "Stereotype Strength for Adjective 'Selfish'", x = "Group 2", y = "Group 1" ) + theme_classic() + theme( axis.text.y = element_text(angle = 15, hjust = 1, size = 5), axis.text.x = element_text(angle = 45, hjust = 1, size = 5), panel.grid = element_blank() )