color = "white", size = 6, fontface = "bold") +
labs(
title = "QTL Matches: Percentage of Replicates Where QTL was Found",
#subtitle = "Percentage of Replicates Containing Each QTL and Avg SNPs Per Replicate",
x = "Query SNP",
y = "Percentage of Replicates Found") +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 25), expand = c(0.1, 0.1)) +
theme_minimal() +
theme(
# Title and subtitle
plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 18, face = "italic", hjust = 0.5),
# Axes labels
axis.text.x = element_text(angle = 0, size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold"))
# Compute the number of QTL hits per replicate
qtl_hits_per_replicate <- sapply(results, function(x) {
if (!is.null(x$qtl_results$matches)) {
length(unique(x$qtl_results$matches$Query_SNP))  # Count unique QTL matches
} else {0} })  # If no matches, count as 0
# Compute accuracy for each replicate
qtl_accuracy_per_replicate <- qtl_hits_per_replicate / 4  # Since each replicate has 4 targets
# Compute overall average accuracy across all replicates
mean_qtl_accuracy <- mean(qtl_accuracy_per_replicate, na.rm = TRUE)
# Create a data frame for visualization
qtl_accuracy_df <- data.frame(QTL_Hits = qtl_hits_per_replicate) # Number of QTLs found per replicate
# Define bin labels (0 to 4 QTL hits)
qtl_hit_levels <- factor(0:4, levels = 0:4)
# Ensure all possible bins (0 to 4) are represented
qtl_accuracy_df <- qtl_accuracy_df %>%
group_by(QTL_Hits) %>%
summarize(Count = n(), .groups = "drop") %>%
complete(QTL_Hits = 0:4, fill = list(Count = 0))  # Fill missing bins with 0 count
# Convert to factor for proper binning
qtl_accuracy_df$QTL_Hits <- factor(qtl_accuracy_df$QTL_Hits, levels = 0:4)
ggplot(qtl_accuracy_df, aes(x = QTL_Hits, y = Count)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
geom_text(aes(label = Count), vjust = 0, size = 5, fontface = "bold") +
labs(
title = "Histogram of QTL Accuracy Across Replicates",
x = "Number of QTLs Found (out of 4)",
y = "Number of Replicates") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 16, face = "bold"),
axis.title.y = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5)) +
scale_y_continuous(limits = c(0, reps), breaks = seq(0, reps, by = 10), expand = c(0, 0)) +
annotate("text", x = 3, y = reps * 0.95,
label = sprintf("Mean Accuracy: %.2f", mean_qtl_accuracy),
size = 6, fontface = "bold", color = "red")
# Bin SNPs by their base pair difference from QTL (keeping upstream (-) and downstream (+))
heatmap_data <- combined_qtl_matches %>%
mutate(Relative_Pos = bp - as.numeric(str_split_fixed(Query_SNP, "\\.", 2)[, 2])) %>%  # Compute SNP position relative to QTL
mutate(Bin = round(Relative_Pos / bin_size) * bin_size) %>%  # Bin into 100kb intervals (keeping negative values)
group_by(Query_SNP, Bin) %>%  # **DO NOT group by `Relative_Pos`**
summarize(
SNP_Count = n(),
Relative_Pos = mean(Relative_Pos, na.rm = TRUE),  # Retain `Relative_Pos` for reference
.groups = "drop")
# Ensure all bins exist for each QTL, filling missing bins with 0 SNPs
heatmap_data <- heatmap_data %>%
group_by(Query_SNP) %>%
complete(Bin = seq(min(Bin), max(Bin), by = bin_size), fill = list(SNP_Count = 0)) %>%
ungroup() %>%
mutate(Relative_Pos = ifelse(is.na(Relative_Pos), Bin, Relative_Pos))  # Ensure Relative_Pos is not NA
# **Set the correct top-to-bottom order for facet wrapping**
fixed_snp_order <- c("1.106956497", "1.281489331", "10.84021443", "18.27355039")
# Convert Query_SNP to a factor with the specified order
heatmap_data$Query_SNP <- factor(heatmap_data$Query_SNP, levels = fixed_snp_order)
# Create a dataset that retains each SNP's actual position without summarization
distance_data <- combined_qtl_matches %>%
mutate(
Relative_Pos = bp - as.numeric(str_split_fixed(Query_SNP, "\\.", 2)[, 2]),  # Compute SNP position relative to QTL
) %>%
select(Query_SNP, Relative_Pos)
# Convert Query_SNP to character before applying factor levels
distance_data <- distance_data %>%
mutate(Query_SNP = factor(as.character(Query_SNP), levels = fixed_snp_order))
# Order the data for plotting
distance_data <- distance_data %>%
arrange(factor(Query_SNP, levels = fixed_snp_order), Relative_Pos)
# Extract Feature_Base, Inheritance Model, and Replicate Identifier
non_ld_pruned_features <- do.call(rbind, lapply(results, function(x) {
if (!is.null(x$overall_feature_importance)) {
x$overall_feature_importance %>%
mutate(
Feature_Base = gsub("_[a-zA-Z]+$", "", Feature),  # Remove inheritance model suffix
Inheritance_Model = sub(".*_", "", Feature),  # Extract inheritance model
Replicate_ID = basename(x$overall_feature_importance$Replicate))  # Extract replicate identifier
} else {NULL}}))
# Compute Within-Replicate Average PFI per Feature
within_replicate_pfi <- lapply(results, function(x) {
x$pareto_features %>%
group_by(Feature, Replicate) %>%
summarize(Avg_PFI = mean(PFI_importance, na.rm = TRUE), .groups = "drop")})
within_replicate_pfi <- do.call(rbind, within_replicate_pfi)
# Compute Grand Average and SD of PFI Importance across Replicates
grand_pfi_summary <- within_replicate_pfi %>%
mutate(Feature_Base = gsub("_[a-zA-Z]+$", "", Feature)) %>%  # Match Feature_Base format
group_by(Feature_Base) %>%
summarize(
Avg_PFI_importance = mean(Avg_PFI, na.rm = TRUE),
SD_PFI_importance = sd(Avg_PFI, na.rm = TRUE),
PFI_Replicate_Count = n(),
.groups = "drop"
) %>%
mutate(
SD_PFI_importance = replace_na(SD_PFI_importance, 0),
SE_PFI_importance = ifelse(PFI_Replicate_Count > 1, SD_PFI_importance / sqrt(PFI_Replicate_Count), 0))
# Aggregate replicate IDs for each SNP
replicate_info <- non_ld_pruned_features %>%
group_by(Feature_Base) %>%
summarize(Replicate_IDs = paste(unique(Replicate_ID), collapse = ","), .groups = "drop")
# Compute Average Overall Feature Importance & Track Occurrences
feature_summary <- non_ld_pruned_features %>%
group_by(Feature_Base, Inheritance_Model) %>%
summarize(
Avg_Overall_Feature_Importance = mean(Overall_Feature_Importance, na.rm = TRUE),  # Average Importance per model
SD_Overall_Feature_Importance = sd(Overall_Feature_Importance, na.rm = TRUE),    # Standard deviation
Count = n(),  # Count of occurrences of each inheritance model for that SNP
.groups = "drop") %>%
mutate(
SD_Overall_Feature_Importance = replace_na(SD_Overall_Feature_Importance, 0),  # Replace NA SD with 0
SE_Overall_Feature_Importance = ifelse(Count > 1, SD_Overall_Feature_Importance / sqrt(Count), 0))  # SE is SD/sqrt(n), but if n=1, set SE = 0
# Merge PFI summary into feature_summary
feature_summary <- feature_summary %>%
left_join(grand_pfi_summary, by = "Feature_Base")
# Identify the Most Common Inheritance Model for Each SNP
most_common_inheritance <- feature_summary %>%
group_by(Feature_Base) %>%
arrange(Feature_Base, desc(Count), desc(Avg_Overall_Feature_Importance)) %>%  # Sort by count & importance
slice(1) %>%  # Take the first row (most common model)
select(Feature_Base, Most_Common_Inheritance = Inheritance_Model) %>%
ungroup()
# Compute the Proportion of Replicates Each SNP Appears In
replicate_counts <- non_ld_pruned_features %>%
group_by(Feature_Base) %>%
summarize(
Replicate_Count = n(),  # Count of replicates the SNP appeared in
Replicate_Proportion = n() / length(results),  # Proportion for reference
.groups = "drop")
# Merge Everything into Final Table
non_ld_pruned_features <- non_ld_pruned_features %>%
group_by(Feature_Base) %>%
summarize(
Average_Overall_Feature_Importance = mean(Overall_Feature_Importance, na.rm = TRUE),
SD_Feature_Importance = sd(Overall_Feature_Importance, na.rm = TRUE),
Count = n(),  # Count occurrences per feature
.groups = "drop") %>%
mutate(
SD_Feature_Importance = replace_na(SD_Feature_Importance, 0),  # Replace NA SD with 0
SE_Feature_Importance = ifelse(Count > 1, SD_Feature_Importance / sqrt(Count), 0)  # Set SE = 0 when Count = 1
) %>%
left_join(most_common_inheritance, by = "Feature_Base") %>%
left_join(replicate_counts, by = "Feature_Base") %>%
arrange(desc(Average_Overall_Feature_Importance))  # Sort by importance
# Select Top 50 for Plotting
top_non_ld_pruned_features <- non_ld_pruned_features %>% slice_head(n = topfeats)
# Extract Chromosome and Base Pair Positions for top non LD pruned and non LD pruned features
top_non_ld_pruned_features <- top_non_ld_pruned_features %>%
mutate(
chr = as.integer(str_extract(Feature_Base, "^[^.]+")),  # Extract chromosome
bp = as.numeric(str_extract(Feature_Base, "(?<=\\.)[0-9]+"))) # Extract base pair position
non_ld_pruned_features <- non_ld_pruned_features %>%
mutate(
chr = as.integer(str_extract(Feature_Base, "^[^.]+")),  # Extract chromosome
bp = as.numeric(str_extract(Feature_Base, "(?<=\\.)[0-9]+")))  # Extract base pair position
# Compute LD-Linked SNP Count with Unique Replicate Tracking
ld_linked_counts <- expand.grid(
Feature_Base_A = top_non_ld_pruned_features$Feature_Base,
Feature_Base_B = non_ld_pruned_features$Feature_Base) %>%  # Compare against all SNPs
left_join(non_ld_pruned_features %>% select(Feature_Base, chr, bp),
by = c("Feature_Base_B" = "Feature_Base")) %>%
left_join(top_non_ld_pruned_features %>% select(Feature_Base, chr, bp),
by = c("Feature_Base_A" = "Feature_Base"), suffix = c("_A", "_B")) %>%
filter(
chr_A == chr_B,  # Same chromosome
abs(bp_A - bp_B) <= LDrange,  # Within LD range
Feature_Base_A != Feature_Base_B) %>%  # Exclude self-comparison
left_join(replicate_info, by = c("Feature_Base_B" = "Feature_Base")) %>%  # Get replicate IDs for LD-linked SNPs
group_by(Feature_Base_A) %>%
summarize(
LD_Linked_Count = n_distinct(Feature_Base_B),  # Unique LD-linked SNPs
Unique_Replicate_Count = n_distinct(unlist(strsplit(paste(Replicate_IDs, collapse = ","), ","))),  # Unique replicate IDs
.groups = "drop")
# Merge LD-Linked SNP Count and Unique Replicate Count into Top 20 SNP Table
top_non_ld_pruned_features <- top_non_ld_pruned_features %>%
left_join(ld_linked_counts, by = c("Feature_Base" = "Feature_Base_A")) %>%
mutate(
LD_Linked_Count = replace_na(LD_Linked_Count, 1),  # If no LD-linked SNPs, set to 1
Unique_Replicate_Count = replace_na(Unique_Replicate_Count, 1),  # If no unique replicates found, set to 1
LD_Linked_Label = paste0("LD_P: ", LD_Linked_Count, " in ", Unique_Replicate_Count, " reps"))  # Final label update
# Define the mapping for inheritance model to superscript numbers
inheritance_map <- c("additive" = 1, "subadd" = 2, "superadd" = 3,
"recessive" = 4, "dominant" = 5, "heterosis" = 6,
"underdominant" = 7, "overdominant" = 8, "pager" = 9)
# Apply the mapping to generate superscript numbers for inheritance models
top_non_ld_pruned_features <- top_non_ld_pruned_features %>%
mutate(
Inheritance_Superscript = inheritance_map[Most_Common_Inheritance],  # Map inheritance model
Formatted_Feature = paste0(Feature_Base, " ", Replicate_Count))  # Add ratio
# ==== TOGGLE: Allow SNPs found in only one replicate? ====
allow_oneoffs <- FALSE  # Set to FALSE to exclude SNPs seen in only 1 replicate
# =========================================================
# Extract chromosome and base pair position for LD clustering
top_non_ld_pruned_features <- top_non_ld_pruned_features %>%
mutate(
chr = as.integer(str_split_fixed(Feature_Base, "\\.", 2)[, 1]),  # Extract chromosome
bp = as.numeric(str_split_fixed(Feature_Base, "\\.", 2)[, 2])) %>%    # Extract base pair
arrange(chr, bp) %>%
mutate(
SD_Feature_Importance = ifelse(is.na(SD_Feature_Importance), 0, SD_Feature_Importance),  # Replace NA SD with 0
SE_Feature_Importance = ifelse(is.na(SE_Feature_Importance), 0, SE_Feature_Importance))   # Replace NA SE with 0
# Cluster SNPs within 1Mb range in the Top 20 Features
clustered_features <- top_non_ld_pruned_features %>%
group_by(chr) %>%
arrange(bp, .by_group = TRUE) %>%  # Ensure sorting before clustering
mutate(
Cluster = cumsum(c(0, diff(bp) > LDrange)),  # Assign cluster IDs based on LD range
Global_Cluster = paste(chr, Cluster, sep = "_")) %>% # Create a globally unique cluster identifier
ungroup()
# STEP 1: Identify representative SNP per LD cluster
representative_snp <- clustered_features %>%
group_by(Global_Cluster) %>%
mutate(max_replicate_proportion = max(Replicate_Proportion, na.rm = TRUE)) %>%
filter(Replicate_Proportion == max_replicate_proportion) %>%
slice_max(Average_Overall_Feature_Importance, n = 1, with_ties = FALSE) %>%
select(Global_Cluster, Feature_Base)
# STEP 2: Pull metrics from top_non_ld_pruned_features
representative_info <- top_non_ld_pruned_features %>%
semi_join(representative_snp, by = "Feature_Base") %>%
left_join(representative_snp, by = "Feature_Base")  # adds Global_Cluster
representative_info <- representative_info %>%
left_join(grand_pfi_summary, by = "Feature_Base")
# STEP 3: Add cluster size
cluster_sizes <- clustered_features %>%
count(Global_Cluster, name = "Top_feat_Recurrence")
representative_info <- representative_info %>%
left_join(cluster_sizes, by = "Global_Cluster")
# STEP 4: Optionally filter out SNPs seen in only one replicate
if (!allow_oneoffs) {
representative_info <- representative_info %>%
filter(Replicate_Count > 1)}
# STEP 5: Assign inheritance superscripts and LD label
representative_info <- representative_info %>%
mutate(
Inheritance_Exponent = inheritance_map[Most_Common_Inheritance],
LD_Linked_Label = paste0("LD50: ", Top_feat_Recurrence))
# Use regex to separate base and ratio
representative_info <- representative_info %>%
mutate(
Feature_Label_Raw = str_extract(Formatted_Feature, "^[^ ]+"),              # SNP name before the space
Feature_Ratio_Label = str_extract(Formatted_Feature, "(?<= ).*")        # Content inside the parentheses including parens
)
# STEP 6: Format feature labels with superscripts and LD info
format_labels_ld <- function(feature, exponent) {
if (!is.na(exponent)) {
bquote(.(feature)^.(exponent))
} else {
bquote(.(feature))}}
# STEP 7: Create inheritance model superscript legend
legend_text_grob_ld <- ggdraw() +
draw_label("Inheritance Model\nSuperscript Legend:\n1 = Additive\n2 = Subadd\n3 = Superadd\n4 = Recessive\n5 = Dominant\n6 = Heterosis\n7 = Underdominant\n8 = Overdominant\n9 = Pager", x = 0, hjust = 0, size = 12, fontface = "bold")
# Create the label summary dataframe and save to CSV
label_info <- representative_info %>%
select(
Representative_SNP = Feature_Label_Raw,
LD_50_Recurrence = Top_feat_Recurrence,
Replicate_Count = Feature_Ratio_Label,
Mean_SNP_Consistency_Score = Average_Overall_Feature_Importance,
Mean_PFI = Avg_PFI_importance)
# Sort by consistency score, leave Feature_Base as character
rep_consistency <- representative_info %>%
arrange(desc(Average_Overall_Feature_Importance))
# Create superscripted labels using original Feature_Base names
formatted_labels_consistency <- mapply(
format_labels_ld,
rep_consistency$Feature_Base,
rep_consistency$Inheritance_Exponent,
SIMPLIFY = FALSE)
# Set names for label mapping
names(formatted_labels_consistency) <- rep_consistency$Feature_Base
# Plot
consistency_plot <- ggplot(rep_consistency,
aes(x = reorder(Feature_Base, Average_Overall_Feature_Importance),
y = Average_Overall_Feature_Importance)) +
geom_bar(stat = "identity", fill = "firebrick", alpha = 0.9, width = 0.6) +
geom_errorbar(aes(ymin = Average_Overall_Feature_Importance - SE_Feature_Importance,
ymax = Average_Overall_Feature_Importance + SE_Feature_Importance),
width = 0.2, color = "black") +
geom_text(aes(y = Average_Overall_Feature_Importance + SE_Feature_Importance,
label = sprintf("%.2f", Average_Overall_Feature_Importance)),
hjust = -0.2, size = 4, fontface = "bold") +
coord_flip() +
scale_x_discrete(labels = function(x) formatted_labels_consistency[x]) +
scale_y_continuous(limits = c(0, 1)) +
labs(
title = paste("Mean Consistency Scores for Most Common SNPs per LD Cluster in Top", topfeats, "SNPs"),
x = "SNP",
y = "Mean Consistency Score") +
theme_minimal() +
theme(
axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
legend.position = "none")
# Combine with legend
consistency_final_plot <- plot_grid(consistency_plot, legend_text_grob_ld, ncol = 2, rel_widths = c(3, 0.55))
consistency_final_plot
# Combine SNP data across replicates
combined_snp_data <- do.call(rbind, lapply(results, function(x) {
if (!is.null(x$snp_hub_data)) {
x$snp_hub_data
} else {NULL} }))
# Verify combined SNP data
print(head(combined_snp_data))
# Add bin information based on LD range to combined SNP data
hotspot_data <- combined_snp_data %>%
mutate(Bin = floor(bp / LDrange)) %>% # Calculate bin for each SNP
group_by(chr, Bin) %>%
summarize(
SNP_Count = n(), # Count SNPs in each bin
.groups = "drop") # Ungroup after summarization
# Verify the hotspot data
print(head(hotspot_data))
# Visualize SNP Hotspots
ggplot(hotspot_data, aes(x = Bin, y = factor(chr, levels = rev(unique(chr))), fill = SNP_Count)) +
geom_tile(color = "white") + # Heatmap with white gridlines
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "SNP Count") +
labs(
title = "Genomic Hotspots of SNP Density",
x = "Genomic Position (1Mb intervals)",
y = "Chromosome") +
scale_y_discrete(expand = expansion(mult = c(0.05, 0.05))) + # Add padding
theme_minimal() +
theme(
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 16, face = "italic", hjust = 0.5),
legend.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 12))
# Extract chr, bp, and map to bins
representative_info <- representative_info %>%
mutate(
Bin = floor(bp / LDrange)) %>%# Map positions to bins
select(-bp) # Drop intermediate column
# Order representative_info by feature importance BEFORE plotting
representative_info <- representative_info %>%
arrange(Average_Overall_Feature_Importance)  # Lower importance SNPs are plotted first
ggplot(hotspot_data, aes(x = Bin, y = factor(chr, levels = rev(unique(chr))), fill = SNP_Count)) +
geom_tile(color = "white") + # Heatmap with white gridlines
geom_point(
data = representative_info,
aes(x = Bin, y = factor(chr, levels = rev(unique(chr))), color = Average_Overall_Feature_Importance),
size = 6, inherit.aes = FALSE) +
geom_text(
data = representative_info,
aes(x = Bin, y = factor(chr, levels = rev(unique(chr))), label = Feature_Base),
vjust = -1, hjust = 1, size = 3.5, color = "black", inherit.aes = FALSE) + # Add labels for top features
scale_fill_gradient(low = "lightblue", high = "darkblue", name = "SNP Count") +
scale_color_gradient(low = "blue", high = "red", limits = c(0, 0.7), breaks = seq(0, 0.70, by = 0.1), name = "Consistency Score") +
scale_x_continuous(
breaks = seq(0, max(hotspot_data$Bin, na.rm = TRUE), by = 50), # Increment x-axis by 50 bins
labels = seq(0, max(hotspot_data$Bin, na.rm = TRUE), by = 50)) +
scale_y_discrete(expand = expansion(mult = c(0.05, 0.05))) + # Add padding
labs(
title = "Genomic Hotspots of SNP Density with Top LD-Clustered Representative SNPs Highlighted",
x = "Genomic Position (1Mb intervals)",
y = "Chromosome") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title = element_text(size = 16, face = "bold"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 16, face = "italic", hjust = 0.5),
legend.title = element_text(size = 14, face = "bold"),
legend.text = element_text(size = 12))
# Combine feature importance data from all replicates
combined_feature_importance <- do.call(rbind, lapply(results, function(x) {
if (!is.null(x$overall_feature_importance)) {
x$overall_feature_importance %>%
mutate(
Feature = str_remove(Feature, "_[a-zA-Z]+$"),  # Remove _inheritance suffix
Replicate = str_remove(basename(x$overall_feature_importance$Replicate[1]), "Rep_")) # Clean Replicate name
} else {NULL} }))
# Count how many replicates each feature appears in
feature_replicate_counts <- combined_feature_importance %>%
group_by(Feature) %>%
summarize(n_replicates = n_distinct(Replicate), .groups = "drop")
# Filter out features that only appear in one replicate
combined_feature_importance <- combined_feature_importance %>%
inner_join(feature_replicate_counts %>% filter(n_replicates > 1), by = "Feature")
# Calculate chromosome extents from snp_hub_data
chr_extents <- snp_hub_data %>%
group_by(chr) %>%
summarize(
chr_start = min(bp, na.rm = TRUE), # Minimum base pair position
chr_end = max(bp, na.rm = TRUE), # Maximum base pair position
.groups = "drop") %>%
arrange(chr)
# Calculate cumulative offsets for sequential stacking
chr_offsets <- chr_extents %>%
mutate(cumulative_offset = lag(cumsum(as.numeric(chr_end - chr_start)), default = 0)) %>%
select(chr, cumulative_offset)
# Extract chromosome and base pair from the `Feature` column in combined_feature_importance
manhattan_data <- combined_feature_importance %>%
mutate(
chr = as.integer(str_split_fixed(Feature, "\\.", 2)[, 1]), # Extract chromosome
bp_with_tag = str_split_fixed(Feature, "\\.", 2)[, 2], # Extract base pair + tag
bp = as.numeric(str_remove(bp_with_tag, "_[a-zA-Z]+$"))) %>% # Remove inheritance tag
group_by(chr, bp) %>% # Group by chromosome and base pair
summarize(
Overall_Feature_Importance = mean(Overall_Feature_Importance, na.rm = TRUE), # Average importance for duplicates
.groups = "drop") %>%
arrange(chr, bp) # Arrange by chromosome and base pair
# Add cumulative base pair positions to the data
manhattan_data <- manhattan_data %>%
left_join(chr_offsets, by = "chr") %>%
mutate(Cumulative_BP = bp + cumulative_offset)
# Calculate midpoint of each chromosome for x-axis labels
chr_midpoints <- chr_extents %>%
left_join(chr_offsets, by = "chr") %>%
mutate(midpoint = (chr_start + chr_end) / 2 + cumulative_offset) %>%
select(chr, midpoint)
# Ensure chr is a factor for plotting
manhattan_data <- manhattan_data %>%
mutate(chr = factor(chr, levels = sort(unique(chr))))
# Define alternating colors for chromosomes
chromosome_colors <- rep(c("gray25", "purple2"), length.out = n_distinct(manhattan_data$chr))
# Plot the Manhattan data
ggplot(manhattan_data, aes(x = Cumulative_BP, y = Overall_Feature_Importance)) +
geom_point(aes(color = chr), size = 2, alpha = 1) +
scale_color_manual(values = chromosome_colors) +
scale_x_continuous(breaks = chr_midpoints$midpoint, labels = chr_midpoints$chr, expand = c(0.01, 0.01)) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
labs(
title = "Manhattan Plot of Mean Consistency Score per SNP Across Replicates",
x = "Chromosome",
y = "Mean Consistency Score") +
theme_minimal() +
theme(
axis.text.x = element_text(size = 14), # X-axis labels
axis.text.y = element_text(size = 14), # Y-axis labels
axis.title.x = element_text(size = 16, face = "bold"), # X-axis title
axis.title.y = element_text(size = 16, face = "bold"), # Y-axis title
plot.title = element_text(size = 18, face = "bold", hjust = 0.5), # Title
#plot.subtitle = element_text(size = 12, face = "italic", hjust = 0.5), # Subtitle
legend.position = "none",  # Remove the legend
panel.grid.major.x = element_blank(), # Remove major vertical grid lines
panel.grid.minor.x = element_blank(), # Remove minor vertical grid lines
panel.grid.minor.y = element_blank()) # Remove minor vertical grid lines
# Process all_pipelines data to create the pipeline summary
results <- lapply(seq_along(results), function(idx) {
replicate_name <- basename(list.dirs(base_dir, recursive = FALSE)[idx]) # Get replicate folder name
if (!is.null(results[[idx]]$all_pipelines)) {
# Extract relevant data from all_pipelines
pipeline_summary <- results[[idx]]$all_pipelines %>%
distinct(Pipeline_No, .keep_all = TRUE) %>% # Keep only one row per pipeline
select(
Pipeline_No,
Pipeline_R2,
Pipeline_Feature_Count,
Pipeline_Selector,
Pipeline_Root) %>%
mutate(Pipeline_Feature_Count_Prop = Pipeline_Feature_Count / branch_max) # Calculate feature count proportion
# Save the pipeline summary to a CSV file in the replicate folder
output_file <- file.path(base_dir, replicate_name, "pipeline_summary.csv")
write.csv(pipeline_summary, output_file, row.names = FALSE)
# Add pipeline_summary to the results object
results[[idx]]$pipeline_summary <- pipeline_summary
} else {
# If all_pipelines is NULL, add an empty pipeline_summary
results[[idx]]$pipeline_summary <- NULL}
return(results[[idx]])})
print("Pipeline summaries created and saved successfully.")
# Function to get unique chr_bin_num per replicate
get_unique_bins_per_replicate <- function(replicate_data) {
all_pipelines <- replicate_data$all_pipelines
snp_hub_data_file <- file.path(starbase_full_dir, starbase_replicate_name , "snp_hub.csv")
snp_hub_data <- read_csv(snp_hub_data_file, show_col_types = FALSE)
# Extract unique SNPs from 'Feature' column (removing _inheritance part)
unique_snps <- all_pipelines$Feature %>%
unique() %>%
str_remove("_.*") # Remove everything after and including the underscore
# Match with snp_hub_data and construct chr_bin_num
matched_snps <- snp_hub_data %>%
filter(snp %in% unique_snps) %>%
mutate(chr_bin_num = paste(chr, bin_num, sep = "_"))
# save matched_snps to a file
output_file <- file.path(base_dir, replicate_data$replicate_name, "bin_summary.csv")
write.csv(matched_snps, output_file, row.names = FALSE)
# Return the number of unique bins
return(n_distinct(matched_snps$chr_bin_num))
}
# Apply function to each replicate and collect bin counts
bin_counts <- sapply(results, get_unique_bins_per_replicate)
# store the bin counts with replicate number in a csv
bin_counts_df <- data.frame(Replicate = seq_along(bin_counts), Bin_Count = bin_counts)
write.csv(bin_counts_df, file = file.path(base_dir, "bin_counts.csv"), row.names = FALSE)
# View individual counts
bin_counts
# Average across replicates
mean_bin_count <- mean(bin_counts)
mean_bin_count
