This walkthrough describes the downstream processing and analysis of 2bRAD reads generated from Montastraea cavernosa samples collected across urbanized and reef habitats in southeast Florida. Sequence alignments are based on an existing genome assembly by John Rippe and Misha Matz: https://doi.org/10.1111/mec.15931. For initial processing of 2bRAD reads regardless of species, please see below:
Library prep, bioinformatics, and analysis protocols are credited to the 2bRAD pipeline originally developed by Misha Matz: https://doi.org/10.1038/nmeth.2023, and further refined by Ryan Eckert: https://ryaneckert.github.io/Stephanocoenia_FKNMS_PopGen/code/
mkdir ~/db/symGenomes
cd ~/db/symGenomes
Using Symbiodiniaceae references from NCBI’s most recent genomes: Symbiodinium (NCBI GCA_965279495.1), Breviolum (GCA_965643015.1), Cladocopium (GCA_947184155.2), Durusdinium (GCA_963970005.1)
python concatFasta.py -o symbConcatGenome.fasta -s symbConcatGenome_summary.tsv -m symbConcatGenome_contig_to_fakechr.tsv GCA_965279495.1_pySymTrid1.1_genomic.fna GCA_965643015.1_pyBreMinu3.1_genomic.fna GCA_947184155.2_Cgoreaui_SCF055-01_v2.1_genomic.fna GCA_963970005.1_Durusdinium_trenchii_CCMP2556_genomic.fna
Building bowtie2 index for concatenated symbiont genomes
conda activate 2bRAD
echo '#!/bin/bash' >genomeBuild.sh
echo bowtie2-build symbConcatGenome.fasta symbConcatGenome >>genomeBuild.sh
echo samtools faidx symbConcatGenome.fasta >>genomeBuild.sh
sbatch -o genomeBuild.o%j -e genomeBuild.e%j --mail-type=ALL --mail-user=studivanms@gmail.com genomeBuild.sh
Building bowtie2 index for M. cavernosa genome
conda activate 2bRAD
echo '#!/bin/bash' >genomeBuild.sh
echo bowtie2-build GCA_965282425.1_jaDipLaby1.1_genomic.fna McavernosaGenome >>genomeBuild.sh
echo samtools faidx GCA_965282425.1_jaDipLaby1.1_genomic.fna >>genomeBuild.sh
sbatch -o genomeBuild.o%j -e genomeBuild.e%j --mail-type=ALL --mail-user=studivanms@gmail.com genomeBuild.sh
conda activate 2bRAD
SYMGENOME=~/db/symGenomes/symbConcatGenome
2bRAD_bowtie2_launcher.py -g $SYMGENOME -f trim -n zooxMaps --split -u un -a zoox --launcher -e studivanms@gmail.com
sbatch zooxMaps.slurm
mkdir ../mappedReads
mkdir ../mappedReads/symbionts
mv *.sam ../mappedReads/symbionts
mv *.zoox ../mappedReads/symbionts
cd ../mappedReads/symbionts
Do we have the correct number of files?
ll *.zoox | wc -l
Counting the mapped symbiont reads
echo '#!/bin/bash' >mappedZooxReads
echo readCounts.sh -e zoox -o Zoox >>mappedZooxReads
sbatch --mem=200GB --mail-type=ALL --mail-user=studivanms@gmail.com mappedZooxReads
scp ZooxReadCounts to local machine
Making script to generate indexed bam files
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 6:00:00 -N 5 -e studivanms@gmail.com -q shortq7
sbatch s2b.slurm
Counting the symbiont reads by genera
>ZooxReads
for i in *.bam; do
echo $i >>ZooxReads;
samtools idxstats $i | cut -f 1,3 >>ZooxReads;
done
scp ZooxReads to local machine
Housekeeping for long-term storage
zipper.py -a -9 -f sam --launcher -e studivanms@gmail.com
sbatch zip.slurm
zipper.py -a -9 -f zoox --launcher -e studivanms@gmail.com
sbatch zip.slurm
cd ~/urban/mcav/filteredReads
conda activate 2bRAD
HOSTGENOME=~/db/mcavGenome/McavernosaGenome
mkdir junk
# mapping with --local option, enables clipping of mismatching ends (guards against deletions near ends of RAD tags)
2bRAD_bowtie2_launcher.py -g $HOSTGENOME -f un --split -u junk -a host --undir junk --launcher -e studivanms@gmail.com
sbatch --mem=200GB maps.slurm
Do we have the correct number of files?
ll *.sam | wc -l
Counting the mapped coral reads
echo '#!/bin/bash' >mappedReads
echo readCounts.sh -e host -o Host >>mappedReads
sbatch --mem=200GB --mail-type=ALL --mail-user=studivanms@gmail.com mappedReads
scp HostReadCounts to local machine
Making script to generate indexed bam files
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -q shortq7 -t 06:00:00 -e studivanms@gmail.com
sbatch --mem=200GB s2b.slurm
ll *bam | wc -l
Housekeeping for long-term storage
zipper.py -a -9 -f sam --launcher -e studivanms@gmail.com
sbatch zip.slurm
zipper.py -a -9 -f un --launcher -e studivanms@gmail.com
sbatch zip.slurm
zipper.py -a -9 -f trim --launcher -e studivanms@gmail.com
sbatch zip.slurm
zipper.py -a -9 -f host --launcher -e studivanms@gmail.com
sbatch zip.slurm
mv *.trim.gz ../../trimmedReads
mv *.sam.gz ../mappedReads
mv *.host.gz ../mappedReads
mv *.bam ../mappedReads
mv *.bai ../mappedReads
cd junk
zipper.py -a -9 -f junk --launcher -e studivanms@gmail.com
sbatch zip.slurm
Doesn’t call actual genotypes, but instead generates genotype likelihoods at each SNP. Optimal for low-coverage data (<10x) like 2bRAD.
Copy all .bam files to a new directory ‘ANGSD’
mkdir ../ANGSD
cd ../ANGSD
mv ../mappedReads/*.bam* .
ls *bam >bamsClones
Assessing base qualities and coverage depth -minMapQ 20:
only highly unique mappings (prob of erroneous mapping =< 1%)
-baq 1: realign around indels (not terribly relevant for
2bRAD reads mapped with –local option) -maxDepth: highest
total depth (sum over all samples) to assess; set to 10x number of
samples -minInd: the minimal number of individuals the site
must be genotyped in. Set to 50% of total N at this stage.
export FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -maxDepth 1200 -minInd 60"
export TODO="-doQsDist 1 -doDepth 1 -doCounts 1 -dumpCounts 2"
echo '#!/bin/bash' >mcavDD.sh
echo angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out dd -nThreads 20 >>mcavDD.sh
sbatch --mem=200GB -o mcavDD.o%j -e mcavDD.e%j --mail-user=studivanms@gmail.com --mail-type=ALL --partition=shortq7 mcavDD.sh
Summarizing results
module load R/3.6.1
echo '#!/bin/bash' >RQC.sh
echo Rscript ~/bin/plotQC.R prefix=dd >>RQC.sh
echo gzip -9 dd.counts >>RQC.sh
sbatch -e RQC.e%j -o RQC.o%j --dependency=afterok:460550 --mem=200GB --mail-type=ALL --mail-user=studivanms@gmail.com RQC.sh
Proportion of sites covered at >5X:
cat quality.txt
scp dd.pdf to local machine to look at distribution of base quality
scores, fraction of sites in each sample passing coverage thresholds and
fraction of sites passing genotyping rates cutoffs. Use these to guide
choices of -minQ, -minIndDepth and
-minInd filters in subsequent ANGSD
runs.
Set the -minInd flag to 80% of the number of samples you
have
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 30 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 96 -snp_pval 1e-6 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
echo '#!/bin/bash' > mcavClones.sh
echo angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out mcavClones >>mcavClones.sh
sbatch --mem=200GB -o radClones.o%j -e radClones.e%j --mail-type=ALL --mail-user=studivanms@gmail.com --partition=shortq7 radClones.sh
scp .ibsMat and .bcf files to local machine Use ibs matrix to
identify clones with hierachial clustering in R.
scp to local machine and run chunk below in
R
pacman::p_load("dendextend", "ggdendro", "tidyverse")
cloneBams = read.csv("../../data/mcav/mcavMetadata.csv") # list of bam files
cloneMa = as.matrix(read.table("../../data/mcav/ANGSD/clones/mcavClones.ibsMat")) # reads in IBS matrix produced by ANGSD
dimnames(cloneMa) = list(cloneBams[,2],cloneBams[,2])
clonesHc = hclust(as.dist(cloneMa),"ave")
clonePops = cloneBams$region
cloneSite = cloneBams$site
cloneDend = cloneMa %>% as.dist() %>% hclust(.,"ave") %>% as.dendrogram()
cloneDData = cloneDend %>% dendro_data()
# Making the branches hang shorter so we can easily see clonal groups
cloneDData$segments$yend2 = cloneDData$segments$yend
for(i in 1:nrow(cloneDData$segments)) {
if (cloneDData$segments$yend2[i] == 0) {
cloneDData$segments$yend2[i] = (cloneDData$segments$y[i] - 0.01)}}
cloneDendPoints = cloneDData$labels
cloneDendPoints$region = clonePops[order.dendrogram(cloneDend)]
cloneDendPoints$site=cloneSite[order.dendrogram(cloneDend)]
rownames(cloneDendPoints) = cloneDendPoints$label
cloneDendPoints$region = as.factor(cloneDendPoints$region)
cloneDendPoints$site = as.factor(cloneDendPoints$site)
# Making points at the leaves to place symbols for regions
point = as.vector(NA)
for(i in 1:nrow(cloneDData$segments)) {
if (cloneDData$segments$yend[i] == 0) {
point[i] = cloneDData$segments$y[i] - 0.01
} else {
point[i] = NA}}
cloneDendPoints$y = point[!is.na(point)]
techReps = c("urban_104", "urban_104_2", "urban_177", "urban_177_2", "urban_177_3", "urban_276", "urban_276_2", "urban_276_3", "urban_429", "urban_429_2", "urban_429_3")
cloneDendPoints$site = factor(cloneDendPoints$site,levels(cloneDendPoints$site)[c(5, 13, 6, 4, 14, 10, 3, 12, 1, 7, 9, 8, 2, 15, 11)])
cloneDendPoints$site
cloneDendPoints$region = factor(cloneDendPoints$region,levels(cloneDendPoints$region)[c(2, 3, 4, 5, 1, 6)])
cloneDendPoints$region
Pal <- c("#E69F00", "#009E73", "#D55E00", "#CC79A7", "#E41A1C", "#377EB8", "#984EA3", "#A65628", "#F781BF", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", "#B3B3B3")
cloneDendA = ggplot() +
geom_segment(data = segment(cloneDData), aes(x = x, y = y, xend = xend, yend = yend2), size = 0.5) +
geom_point(data = cloneDendPoints, aes(x = x, y = y, fill = site, shape = region), size = 4, stroke = 0.25) +
# scale_fill_brewer(palette = "Dark2", name = "Site") +
scale_fill_manual(values = Pal, name= "Site")+
scale_shape_manual(values = c(21, 22, 23, 24, 25, 8), name = "Region")+
geom_hline(yintercept = 0.135, color = "red", lty = 5, size = 0.75) + # creating a dashed line to indicate a clonal distance threshold
geom_text(data = subset(cloneDendPoints, subset = label %in% techReps), aes(x = x, y = (y - .045), label = label), angle = 90) + # spacing technical replicates further from leaf
geom_text(data = subset(cloneDendPoints, subset = !label %in% techReps), aes(x = x, y = (y - .025), label = label), angle = 90) +
labs(y = "Genetic distance (1 - IBS)") +
guides(fill = guide_legend(override.aes = list(shape = 22)))+
theme_classic()
cloneDend = cloneDendA + theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 12, color = "black", angle = 90),
axis.text.y = element_text(size = 10, color = "black"),
axis.line.y = element_line(),
axis.ticks.y = element_line(),
panel.grid = element_blank(),
panel.border = element_blank(),
panel.background = element_rect(fill = "white", colour = NA),
plot.background = element_rect(fill = "white", colour = NA),
legend.key = element_blank(),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
legend.position = "bottom")
cloneDend
ggsave("../../figures/mcav/cloneDend.pdf", plot = cloneDend, height = 8, width = 24, units = "in", dpi = 300)
ggsave("../../figures/mcav/cloneDend.png", plot = cloneDend, height = 8, width = 24, units = "in", dpi = 300)
Set the -minInd flag to 80% of the number of samples you
have remaining after removal of clones
mkdir clones
mv mcavClones* clones
cat bamsClones | grep -v 'MCAV_urban_006.trim.un.bt2.bam\|MCAV_urban_062.trim.un.bt2.bam\|MCAV_urban_104.trim.un.bt2.bam\|MCAV_urban_115.trim.un.bt2.bam\|MCAV_urban_177_2.trim.un.bt2.bam\|MCAV_urban_177.trim.un.bt2.bam\|MCAV_urban_272.trim.un.bt2.bam\|MCAV_urban_276_2.trim.un.bt2.bam\|MCAV_urban_276_3.trim.un.bt2.bam\|MCAV_urban_277.trim.un.bt2.bam\|MCAV_urban_278.trim.un.bt2.bam\|MCAV_urban_279.trim.un.bt2.bam\|MCAV_urban_292.trim.un.bt2.bam\|MCAV_urban_429_3.trim.un.bt2.bam\|MCAV_urban_429.trim.un.bt2.bam' >bamsNoClones
module load angsd-0.933-gcc-9.2.0-65d64pp
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 30 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 84 -snp_pval 1e-6 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
echo '#!/bin/bash' > mcavNoClones.sh
echo angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out mcavNoClones >> mcavNoClones.sh
sbatch --mem=200GB -o radNoClones.o%j -e radNoClones.e%j -p shortq7 --mail-type=ALL --mail-user=studivanms@gmail.com radNoClones.sh
How many SNPs do we have?
grep "filtering:" mcavNoClones.e*
Number of sites retained after filtering: 12258
Some housekeeping
zipper.py -a -9 -f bam --launcher -e studivanms@gmail.com
sbatch zip.slurm
mkdir ../pcangsd
cd ../pcangsd
cp ../ANGSD/mcavNoClones.beagle.gz .
conda activate pcangsd
echo '#!/bin/bash' > pcangsd
echo 'pcangsd -b mcavNoClones.beagle.gz -o mcavPcangsd --admix --inbreed-samples --pcadapt --selection' >> pcangsd
launcher_creator.py -j pcangsd -n pcangsd -q shortq7 -t 06:00:00 -e $EMAIL -w 1 -N 1
sbatch pcangsd.slurm
Calculating population structure from genotype likelihoods using NGSadmix (part of ANGSD) Simulating (50x) different values of K (minimum of 1, maximum of the number of your sites + 3)
mkdir ../ngsAdmix
cp *beagle* ../ngsAdmix
cd ../ngsAdmix
ngsAdmixLauncher.py -f mcavNoClones.beagle.gz --maxK 18 -r 50 -n mcav --launcher -e studivanms@gmail.com
sbatch --mem=200GB mcavNgsAdmix.slurm
Taking the likelihood value from each run of NGSadmix and put them into a file that can be used with Clumpak to calculate the most likely K using the methods of Evanno et al. (2005).
>mcavNgsAdmixLogfile
for log in mcav*.log; do
grep -Po 'like=\K[^ ]+' $log >> mcavNgsAdmixLogfile;
done
Start R in command line
R
logs <- as.data.frame(read.table("mcavNgsAdmixLogfile"))
# output is organized with 10, 11 preceding 1, 2, 3 etc.
logs$K <- c(rep("10", 50), rep("11", 50), rep("12", 50), rep("13", 50), rep("14", 50), rep("15", 50), rep("16", 50), rep("17", 50), rep("18", 50), rep("1", 50), rep("2", 50), rep("3", 50), rep("4", 50), rep("5", 50), rep("6", 50), rep("7", 50), rep("8", 50), rep("9", 50))
write.table(logs[, c(2, 1)], "mcavNgsAdmixLogfile_formatted", row.names = F, col.names = F, quote = F)
quit()
# No need to save workspace image [press 'n']
n
Check that the formatted log file has the correct number of entries
cat mcavNgsAdmixLogfile_formatted | wc -l
900, so yes (K=18 * 50 simulations each)
Make copies of .qopt files to run structure selector on (.Q files)
for file in mcav*.qopt; do
filename=$(basename -- "$file" .qopt);
cp "$file" "$filename".Q;
done
mkdir mcavQ
mv mcav*Q mcavQ
zip -r mcavQ.zip mcavQ
scp .zip, formatted logfile, and mcavNoClones* to local machine
Create a PopMap file from bamsNoClones: a two-column, tab-delimited text file with sampleID and populationID Reorder output Q files and PopMap file based on specific site order
library(tidyverse)
popfile <- read.table(file = "../../data/mcav/ngsAdmix/mcavPopfile.txt", header = FALSE, col.names=c("sample", "site"), stringsAsFactors=FALSE)
site_order <- c(
"EmeraldReef",
"RainbowReef",
"FisherIsland",
"CoralCityCamera",
"StarIsland",
"MacArthurNorth",
"BelleIsle",
"PillarsReef",
"ArchCreek",
"FIUBiscayneBay",
"GracelandReef",
"FTL4Reef",
"BC1Reef",
"T328Reef",
"PeanutIsland"
)
site_factor <- factor(popfile$site, levels = site_order)
ord <- order(site_factor)
popfile_sorted <- popfile[ord, ]
write.table(popfile_sorted, "../../data/mcav/ngsAdmix/mcavPopfile_sorted.txt",
quote=FALSE, row.names=FALSE, col.names=FALSE, sep="\t")
# Unzips, loops through, and reorders each .Q file based on popfile_sorted
unzip("../../data/mcav/ngsAdmix/mcavQ.zip", exdir = "../../data/mcav/ngsAdmix/")
q_dir <- "../../data/mcav/ngsAdmix/mcavQ" # where .Q files now live
out_dir <- "../../data/mcav/ngsAdmix/mcavQ_sorted" # new folder for reordered files
dir.create(out_dir, showWarnings = FALSE)
q_files <- list.files(q_dir, pattern="\\.Q$|\\.qopt$", full.names=TRUE)
length(q_files) # should be 800
for (file in q_files) {
q <- read.table(file, header=FALSE, colClasses = "character")
# must have same number of rows
stopifnot(nrow(q) == nrow(popfile_sorted))
q_sorted <- q[ord, ]
out_file <- file.path(out_dir, basename(file))
write.table(
q_sorted,
out_file,
quote = FALSE,
row.names = FALSE,
col.names = FALSE
)
}
# Cleanup
unlink(q_dir, recursive = TRUE) # Delete the uncompressed mcavQ directory
# Zip the mcavQ_sorted directory
out_dir_norm <- normalizePath(out_dir)
parent_dir <- dirname(out_dir_norm)
folder_name <- basename(out_dir_norm)
zipfile_path <- file.path(parent_dir, "mcavQ_sorted.zip")
cmd <- sprintf(
'cd "%s" && zip -r "%s" "%s"',
parent_dir,
basename(zipfile_path),
folder_name
)
cat("Running command:\n", cmd, "\n")
system(cmd)
if (file.exists(zipfile_path)) {
unlink(out_dir_norm, recursive = TRUE)
} # Delete the uncompressed mcavQ_sorted directory
Upload logfile to CLUMPAK as a Log Probability table file (http://clumpak.tau.ac.il/bestK.html) And .zip to structure selector as an ADMIXTURE file (https://lmme.qdio.ac.cn/stable/StructureSelector/index.html)
If Clumpak fails due to all simulations of K=1 having zero standard deviation (all models having the same log likelihood), your populations may be acting as a single unit (panmixia) We can run the Evanno method another way to determine if K=1 is the most likely scenario, and can evaluate potential cryptic genetic lineages
# Load libraries
library(ggplot2)
library(dplyr)
library(patchwork)
# -------------------------
# 1. Read space-delimited input file
# -------------------------
data_file <- "../../data/mcav/ngsAdmix/mcavNgsAdmixLogfile_formatted" # replace with your file
df <- read.table(data_file, header = FALSE, sep = "",
col.names = c("K", "LogLikelihood"), stringsAsFactors = FALSE)
# -------------------------
# 2. Clean columns and convert to numeric
# -------------------------
df$K <- as.numeric(trimws(df$K))
df$LogLikelihood <- as.numeric(trimws(df$LogLikelihood))
df <- df[!is.na(df$K) & !is.na(df$LogLikelihood), ]
if(nrow(df) == 0) stop("No valid data found in the file. Check file formatting.")
# -------------------------
# 3. Summarize mean and SD for each K
# -------------------------
summary_df <- df %>%
group_by(K) %>%
summarise(MeanLnP = mean(LogLikelihood, na.rm = TRUE),
SD = sd(LogLikelihood, na.rm = TRUE)) %>%
arrange(K)
# -------------------------
# 4. Calculate ΔK robustly
# -------------------------
L <- summary_df$MeanLnP
n <- nrow(summary_df)
deltaK <- rep(NA, n)
if(n >= 3) {
for(i in 2:(n-1)) {
Li_minus1 <- L[i-1]; Li <- L[i]; Li_plus1 <- L[i+1]; SDi <- summary_df$SD[i]
if(all(is.finite(c(Li_minus1, Li, Li_plus1, SDi))) && SDi != 0) {
deltaK[i] <- abs(Li_plus1 - 2*Li + Li_minus1) / SDi
}
}
}
summary_df$deltaK <- deltaK
# -------------------------
# 5. Identify all ΔK peaks
# -------------------------
is_peak <- rep(FALSE, n)
if(n >= 3 && any(!is.na(deltaK))) {
for(i in 2:(n-1)) {
if(!is.na(deltaK[i])) {
left <- deltaK[i-1]; right <- deltaK[i+1]
if((is.na(left) || deltaK[i] > left) && (is.na(right) || deltaK[i] > right)) {
is_peak[i] <- TRUE
}
}
}
}
peaks_df <- summary_df[is_peak, ]
# Identify global best K (largest ΔK)
global_best <- if(nrow(peaks_df) > 0) peaks_df[which.max(peaks_df$deltaK), ] else NULL
# -------------------------
# 6. Create plots
# -------------------------
# ΔK plot
if(!all(is.na(summary_df$deltaK))) {
p_deltaK <- ggplot(summary_df, aes(x = K, y = deltaK)) +
geom_line(color = "steelblue") +
geom_point() +
theme_minimal() +
labs(title = "ΔK Plot (Evanno Method)",
x = "K",
y = expression(Delta*K)) +
coord_cartesian(clip = "off") # prevent label clipping
if(nrow(peaks_df) > 0) {
p_deltaK <- p_deltaK +
geom_point(data = peaks_df, aes(x = K, y = deltaK), color = "red", size = 3) +
geom_text(data = peaks_df, aes(x = K, y = deltaK, label = paste("K =", K)),
vjust = -0.5, color = "red", fontface = "bold", clip = "off")
}
if(!is.null(global_best)) {
p_deltaK <- p_deltaK +
geom_point(data = global_best, aes(x = K, y = deltaK), color = "darkred", size = 5) +
geom_text(data = global_best, aes(x = K, y = deltaK, label = paste("Best K =", K)),
vjust = -1, color = "darkred", fontface = "bold", clip = "off")
}
} else {
p_deltaK <- NULL
}
# Mean LogLikelihood plot
p_mean <- ggplot(summary_df, aes(x = K, y = MeanLnP)) +
geom_line(color = "darkgreen") +
geom_point(size = 3) +
theme_minimal() +
labs(title = "Mean Log Likelihood vs K",
x = "K",
y = "Mean Log Likelihood") +
coord_cartesian(clip = "off")
# Combine plots vertically
if(!is.null(p_deltaK)) {
p_combined <- p_deltaK / p_mean + plot_layout(heights = c(1,1))
} else {
p_combined <- p_mean
}
# -------------------------
# 7. Save combined plot
# -------------------------
output_file <- "../../figures/mcav/DeltaK_and_MeanLnP_plot.png"
ggsave(output_file, plot = p_combined, width = 6, height = 8, dpi = 300)
# -------------------------
# 8. Print peak info
# -------------------------
cat("Plot saved as", output_file, "\n")
if(!is.null(global_best)) {
cat("ΔK peaks at K =", paste(peaks_df$K, collapse = ", "), "\n")
cat("ΔK values:", paste(round(peaks_df$deltaK, 2), collapse = ", "), "\n")
cat("Global best K =", global_best$K, "with ΔK =", round(global_best$deltaK, 2), "\n")
} else if(nrow(summary_df) > 0) {
cat("ΔK could not be calculated; showing Mean Log Likelihood plot.\n")
} else {
cat("No valid data to plot.\n")
}