Skip to content

Latest commit

 

History

History
1552 lines (1346 loc) · 49.1 KB

readme.org

File metadata and controls

1552 lines (1346 loc) · 49.1 KB

Mito Control Region

Vollmer mtCR sequences

  1. Download sequences from NCBI to ./vollmer.samples.fa
    ROOT=$(git rev-parse --show-toplevel)
    cat <(grep -v '^#' vollmer_2017.online_resource_7.tsv | cut -f 4 ) \
        <(grep -v '^#' vollmer_2021.SI-table-4.tsv | cut -f 7) |
        sort -u |
        ~/.local/edirect/efetch -db nuccore -format fasta > $DIR/vollmer.samples.fa
        

With Outgroup

Align

Combine datasets and trim names (includes the two outliers). Align with mafft

ROOT=$(git rev-parse --show-toplevel)
DIR=$ROOT/4-mtCR/

cat $DIR/vollmer.samples.fa \
    $ROOT/3-assemble/mtCR.fa \
    $DIR/outgroup.fa |
    sed '/>/s/ .*//' |
    $ROOT/apps/mafft-7.520/mafft.bat --auto --thread 4  \
        /dev/stdin > $DIR/1-mafft.fasta

Load libraries

library(adegenet)
library(haplotypes)
library(LEA)
library(tidyverse)
library(ape)
library(tidytree)
library(ggtree)
library(Biostrings)
library(hues)
library(hierfstat)
library(poppr)
library(ascii)
op <- options(asciiType = "org")

library(aplot)
options(width = 70)
sessionInfo()

Load metadata and color

colors <- list("2017"=c("NW.inner"="#98d4ed",
                        "E.inner"  =  "#f5c89b",
                        "NW.outer" =  "#bebfbf",
                        "E.outer"  =  "#e67383",
                        "NW.Oceanic"= "#e0e47e",
                        "E.oceanic" = "#c7a3c7",
                        "NE.oceanic"= "#99c796"),
               "2021"=c("green" =  "#669e40",
                        "blue"  =  "#2f5597"))

meta.counts <- mapply(left_join,
       list("2017"="vollmer_2017.online_resource_16.tsv",
       "2021"="vollmer_2021.SI-table-3.tsv") %>%
       lapply(read.delim, comment.char = '#') %>%
       lapply(gather, -Haplotype, key="Group", value='count') %>%
       lapply(select, Haplotype, Group, count),
       list("2017"="vollmer_2017.online_resource_7.tsv",
            "2021"="vollmer_2021.SI-table-4.tsv") %>%
       lapply(read.delim,sep="\t", skip=8) %>%
       lapply(select, Haplotype, "ID"=starts_with("GenBank")),
       SIMPLIFY = F) %>%
  bind_rows(.id="Set") %>%
  unique %>%
  mutate(ID=paste0(ID, ".1")) %>%
  filter(!grepl("Green.pop.subset", Group ) & count > 0) %>%
  mutate(Group = sub("Entire.(green|blue).pop.*", "\\1", Group))

Load and trim alignment (w/ outgroup)

fasta.with_outgroup <-
  read.dna("1-mafft.fasta.gz", format='fasta', as.matrix=T)
fasta.with_outgroup

image.DNAbin(fasta.with_outgroup)

Removing gaps at the start and end of the alignments

gap.counts = colSums(as.character(fasta.with_outgroup) == "-")
trim.range = range(which(gap.counts==0))

fasta.with_outgroup.trimmed =
  fasta.with_outgroup[,seq(trim.range[1], trim.range[2])]

fasta.with_outgroup.trimmed

Fix ambiguous bases

fasta.with_outgroup.trimmed.fixed <-
  solveAmbiguousBases(fasta.with_outgroup.trimmed)
fasta.with_outgroup.trimmed.fixed

Write trimmed fasta file

write.FASTA(fasta.with_outgroup.trimmed, "1-mafft.trimmed.fasta")
write.FASTA(fasta.with_outgroup.trimmed.fixed,
            "1-mafft.trimmed.fixed.fasta")

Haplotypes

haplotypes.with_outgroup <-
  fasta.with_outgroup.trimmed.fixed %>%
  as.dna %>%
  haplotype(indel='missing')

haplotypes.with_outgroup@freq %>%
  table %>%
  as.data.frame %>%
  setNames(c("Size", "Count")) %>%
  mutate(Size = as.character(Size)) %>%
  rbind(., c("Total", sum(.$Count)))

Rename haplotypes mtCR-new, mtCR-mix, and mtCR-pub for haplotypes containing only new sequences, mix of published and new, and only published.

haplotypes.new_names <-
  haplotypes.with_outgroup@haplist %>%
  lapply(as.data.frame) %>%
  bind_rows(.id='name') %>%
  setNames(c("Haplotype", "Sample")) %>%
  group_by(Haplotype) %>%
  mutate(New = as.numeric(any(grepl('^SER', Sample))),
         Pub = as.numeric(any(!grepl('^SER', Sample))) * 2,
         Out = any((Sample %in% c("DQ060054.1", "DQ060057.1")))) %>%
  mutate(NewName = as.character(factor(New+Pub, 1:3,
                                       labels = c("mtCR.new",
                                                  "mtCR.pub",
                                                  "mtCR.mix")))) %>%
  mutate(NewName = ifelse(Out, "mtCR.out", NewName),
         Size=n()) %>%
  select(Haplotype, NewName, Size) %>%
  unique() %>%
  rowid_to_column("tmp") %>%
  arrange(desc(Size)) %>%
  group_by(NewName) %>%
  mutate(NewName = sprintf("%s-%d", NewName, row_number())) %>%
  arrange(tmp) %>%
  select(-tmp)
haplotypes.new_names

names(haplotypes.with_outgroup@hapind) <- haplotypes.new_names$NewName
names(haplotypes.with_outgroup@haplist) <- haplotypes.new_names$NewName
rownames(haplotypes.with_outgroup@sequence) <-
  haplotypes.new_names$NewName

Write fasta

haplotypes.with_outgroup@sequence %>%
  as.DNAbin %>%
  write.FASTA("2-haplotypes.with_outgroup.fixed.fasta")

haplotypes.with_outgroup@sequence[
  !grepl("mtCR.out", names(haplotypes.with_outgroup@haplist)),
  ] %>%
  as.DNAbin %>%
  write.FASTA("2-haplotypes.without_outgroup.fixed.fasta")

Number and Size range of each type of haplotype

data.frame(Haplotype=names(haplotypes.with_outgroup@hapind),
           Size = haplotypes.with_outgroup@freq) %>%
  mutate(Haplotype = sub('-.*', '', Haplotype)) %>%
  group_by(Haplotype) %>%
  summarise(Count = n(),
            Range = sprintf("[%d - %d]",
                            min(Size),
                            max(Size)),
            .groups='drop') %>%
  ascii(digit=0, include.rownames=F)
HaplotypeCountRange
mtCR.mix12[2 - 198]
mtCR.new8[1 - 3]
mtCR.out2[1 - 1]
mtCR.pub53[1 - 4]

Published vs New haplotype overlap

haplotypes.with_outgroup@haplist %>%
  lapply(as.data.frame) %>%
  bind_rows(.id='name') %>%
  setNames(c("Haplotype", "Sample")) %>%
  group_by(Haplotype) %>%
  summarise(New = sum(grepl('^SER', Sample)),
            Pub = sum(!grepl('^SER', Sample)),
            Total = n()) %>%
  mutate(Haplotype = ifelse(Pub==0,
                            sprintf("*%s*", Haplotype),
                            Haplotype)) %>%
  arrange(desc(Total)) %>%
  ascii(digit=0, include.rownames=F)
HaplotypeNewPubTotal
mtCR.mix-11908198
mtCR.mix-265570
mtCR.mix-346854
mtCR.mix-4401050
mtCR.mix-522123
mtCR.mix-611112
mtCR.mix-7819
mtCR.mix-8415
mtCR.mix-9314
mtCR.pub-1044
mtCR.pub-2044
mtCR.mix-10123
mtCR.new-1303
mtCR.pub-3033
mtCR.pub-4033
mtCR.mix-11112
mtCR.mix-12112
mtCR.new-2202
mtCR.pub-5022
mtCR.pub-6022
mtCR.pub-7022
mtCR.pub-8022
mtCR.new-3101
mtCR.new-4101
mtCR.new-5101
mtCR.new-6101
mtCR.new-7101
mtCR.new-8101
mtCR.out-1011
mtCR.out-2011
mtCR.pub-10011
mtCR.pub-11011
mtCR.pub-12011
mtCR.pub-13011
mtCR.pub-14011
mtCR.pub-15011
mtCR.pub-16011
mtCR.pub-17011
mtCR.pub-18011
mtCR.pub-19011
mtCR.pub-20011
mtCR.pub-21011
mtCR.pub-22011
mtCR.pub-23011
mtCR.pub-24011
mtCR.pub-25011
mtCR.pub-26011
mtCR.pub-27011
mtCR.pub-28011
mtCR.pub-29011
mtCR.pub-30011
mtCR.pub-31011
mtCR.pub-32011
mtCR.pub-33011
mtCR.pub-34011
mtCR.pub-35011
mtCR.pub-36011
mtCR.pub-37011
mtCR.pub-38011
mtCR.pub-39011
mtCR.pub-40011
mtCR.pub-41011
mtCR.pub-42011
mtCR.pub-43011
mtCR.pub-44011
mtCR.pub-45011
mtCR.pub-46011
mtCR.pub-47011
mtCR.pub-48011
mtCR.pub-49011
mtCR.pub-50011
mtCR.pub-51011
mtCR.pub-52011
mtCR.pub-9011

Sequence differences between mtCR.new haplotypes and the most similar haplotype sequences

haplotypes.with_outgroup.dist <-
  haplotypes.with_outgroup@sequence %>%
  as.DNAbin %>%
  dist.dna(model = 'N', as.matrix = T) %>%
  as.data.frame %>%
  rownames_to_column("Hap1") %>%
  gather(-Hap1, key="Hap2", value="Dist")

haplotypes.of.interest <- haplotypes.with_outgroup.dist %>%
  mutate(tmp=Hap1, Hap1=Hap2, Hap2=tmp) %>%
  select(-tmp) %>%
  rbind(haplotypes.with_outgroup.dist) %>%
  filter(grepl("mtCR.new", Hap1)) %>%
  filter(Dist <=  1) %>%
  unique

haplotypes.with_outgroup@sequence %>%
  t %>%
  as.data.frame %>%
  rowid_to_column("Pos") %>%
  gather(-Pos, key="Hap2", value="base") %>%
  right_join(haplotypes.of.interest,
             relationship = 'many-to-many') %>%
  group_by(Hap1, Pos ) %>%
  filter(length(unique(base)) > 1) %>%
  ungroup %>%
  ggplot(aes(Pos, Hap2, color=base)) +
  geom_text(aes(label=base)) +
  facet_grid(rows=vars(Hap1), space = 'free', scales = 'free',
             switch='y') +
  theme_bw() +
  theme(strip.text.y.left  = element_text(angle = 0),
        strip.placement = 'outside',
        legend.position = 'none',
        axis.title = element_blank())

Plot pie chart

haplotypes.with_outgroup@freq %>%
  as.data.frame(nm='Size') %>%
  group_by(Size) %>%
  count %>%
  ggplot(aes(x=1, n, fill=factor(Size))) +
  geom_col() +
  coord_polar(theta = 'y', direction = -1) +
  scale_fill_iwanthue() +
  theme_minimal() +
  theme(axis.title = element_blank(),
        axis.text.y = element_blank())

Structure

  1. convert to genlight
    snps.without_outgroup <- fasta2genlight(
      "2-haplotypes.without_outgroup.fixed.fasta",
      quiet = T,
      snpOnly=T)
    snps.without_outgroup
        

  2. Convert genlight to geno format (a la dartR) and run snmf
    geno <- as.matrix(snps.without_outgroup)
    geno[is.na(geno)] <- 9
    
    ## Remove constant snps
    #geno <- geno[,apply(geno, 2, function(x) length(unique(x)) > 1)]
    
    outfile <- "4-structure/haplotypes-fixed"
    write.table(
      geno,
      paste(outfile, ".lfmm", sep = ""),
      col.names = FALSE,
      row.names = FALSE,
      sep = " "
    )
    ## write geno
    write.table(
      t(geno),
      paste(outfile, ".geno", sep = ""),
      col.names = FALSE,
      row.names = FALSE,
      sep = ""
    )
    
    ## project <- snmf(paste0(outfile, ".geno"),
    ##                 K = 1:15,
    ##                 entropy = T,
    ##                 repetitions = 10,
    ##                 project = "new",
    ##                 ploidy=1,
    ##                 CPU = 4)
    
    project <- load.snmfProject(paste0(outfile, ".snmfProject"))
        
  3. PCA scree plot
    pca.scree.plot <- pca(paste0(outfile, ".lfmm"), scale=T, K=15) %>%
      tracy.widom %>%
      ggplot(aes(N, percentage)) +
      geom_line() +
      geom_point() +
      scale_y_continuous(labels=scales::label_percent()) +
      labs(title = "PCA Scree Plot",
           x = "Principal Components",
           y = "Percentage of Variance") +
      theme_minimal()
    pca.scree.plot
        

KruncrossEntropy
180.20777
280.13652
380.08029
480.08253
580.07747
680.08881
7100.13266
8100.14189
950.17276
10100.16388
1120.17915
1280.23712
1370.20993
1420.18649
1580.18041
  1. Assign memberships for best run of each K
    admix.coefs <- mapply(Q, K=best.runs$K, run=best.runs$run,
                          MoreArgs = list(project)) %>%
      lapply(as.data.frame) %>%
      lapply(rowid_to_column, "ID") %>%
      bind_rows(.id="K") %>%
      gather(-K, -ID, key="pop", value="q") %>%
      filter(!is.na(q)) %>%
      mutate(pop = as.numeric(sub("V", "", pop)),
             K = as.numeric(K)) %>%
      group_by(K, ID) %>%
      reframe(K, ID, pop, q,
              best.pop=pop[which.max(q)],
              best.q = max(q))
    
        
  2. Structure membership graph
    member.data <- filter(admix.coefs, K==4)
    member.data.order <- spread(member.data, pop, q) %>%
                           select(-ID, -K, -best.pop, -best.q) %>%
                           as.matrix %>%
                           dist %>%
                           hclust(method="ave") %>%
                           as.dendrogram %>%
                           order.dendrogram
    haplotypes.names <- row.names(geno)
    pop.labels <- sprintf("mtCR.%s",
                        c("sound", "inner", "outer", "ocean"))
    member.data <-  mutate(member.data,
                         ID = haplotypes.names[ID],
                         pop = factor(pop, c(1, 2, 3, 4),
                                      pop.labels),
                         best.pop = factor(best.pop, c(1, 2, 3, 4),
                                           pop.labels)) %>%
    mutate(ID = factor(ID, haplotypes.names[member.data.order] ))
    plot.structure <- ggplot(member.data, aes(ID, q, fill=pop)) +
    geom_col(width=1) +
    facet_grid(cols=vars(best.pop), scales="free", space="free",
               switch='x') +
    scale_fill_iwanthue() +
    scale_x_discrete(position = "top", expand = c(0,0)) +
    scale_y_continuous(labels=scales::label_percent(), expand = c(0,0))+
    theme_bw()+
    theme( axis.text.x = element_blank(),
          axis.title.x = element_blank(),
      legend.position = 'bottom')
    plot.structure
    
        

anno <- haplotypes.with_outgroup@haplist %>%
  lapply(as.data.frame, nm='ID') %>%
  bind_rows(.id="name") %>%
  left_join(select(meta.counts, ID, Set, Group, count) %>%
  rbind(data.frame(ID = grep("SER",
                     row.names(fasta.with_outgroup.trimmed),
                     value=T),
           Set="New",
           Group="New",
           count = 1)) , by="ID") %>%
  group_by(name, Set, Group) %>%
  summarise(count = sum(count)) %>%
  rename("name"="ID") %>%
  left_join(unique(select(member.data, ID, best.pop))) %>%
  mutate(Set = ifelse(is.na(Set), "2021", Set),
         Group = if_else(is.na(Group), "blue", Group)) %>%
  mutate(
    ID = factor(ID, haplotypes.names[member.data.order] ),
    Group = factor(Group, c('NW.inner', 'E.inner', 'NW.outer',
                            'E.outer', 'NW.Oceanic', 'NE.oceanic',
                            'E.oceanic', 'green', 'blue', "New"))) %>%
  filter(!is.na(ID))

meta.counts %>%
  select(ID, Group, count) %>%
  spread(key=Group, value = count) %>%
  write.table("published-counts.txt")

write.table(anno, "haplotype.published-groups.txt",
            sep="\t", row.names=F)
anno <- read.delim("haplotype.published-groups.txt")

heatmap.plot <- ggplot(anno, aes(ID,Group)) +
  geom_tile(aes(fill=Group), na.rm=T) +
  geom_text(aes(label=count), na.rm=T) +
  facet_grid(rows=vars(Set), cols=vars(best.pop),
             scales="free", space="free") +
  scale_fill_manual( values=c(colors[['2021']],
                              colors[['2017']],
                              New="grey")) +
  scale_x_discrete(expand = c(0,0)) +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.x= element_text(angle = 90, hjust = 1, vjust = 0.5),
        legend.position = 'none')
heatmap.plot
anno %>%
  group_by(Set, Group, best.pop) %>%
  summarise(count=sum(count)) %>%
  ggplot(aes(Group, count)) +
  geom_col(aes(fill=Group)) +
  geom_label(aes(label=count), y=10, fill='white') +
  facet_grid(cols=vars(Set), rows=vars(best.pop),
             scales="free", space="free") +
  scale_fill_manual( values=c(colors[['2021']],
                              colors[['2017']],
                              New="grey")) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0), limits=c()) +
  coord_cartesian(y=c(0,75)) +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.x= element_text(angle = 25, hjust = 1,
                                  vjust = 1),
        legend.position = 'none')
library(tidyverse)

read.delim("haplotype.published-groups.txt") %>%
  replace_na(list(count=1)) %>%
  mutate(Group = factor(Group, c('NW.inner', 'E.inner', 'NW.outer',
                                 'E.outer', 'NW.Oceanic', 'NE.oceanic',
                                 'E.oceanic', 'green', 'blue', "New")),
         best.pop = factor(best.pop, c("mtCR.sound","mtCR.inner",
                                       "mtCR.outer","mtCR.ocean"))) %>%
  group_by(Set, Group, best.pop) %>%
  summarise(samples=sum(count), haplotypes=n()) %>%
  pivot_wider(id_cols=c("Set", "Group"),
              values_from = c("samples", "haplotypes"),
              names_from = c("best.pop"),
              names_vary = 'slowest',
              values_fill = 0) %>%
  split(., .$Set) %>%
  lapply(as.data.frame)

Phylogenetics tree

## haplotypes.with_outgroup.fasta <-
##   read.FASTA("2-haplotypes.with_outgroup.fixed.fasta")

## tree.data <- haplotypes.with_outgroup.fasta %>%
##   dist.dna(pairwise.deletion = T) %>%
##   nj

## tree.data$node.label <-
##   boot.phylo(tree.data,
##              as.matrix(haplotypes.with_outgroup.fasta),
##              FUN=function(xx) nj(dist.gene(xx,
##                                            pairwise.deletion = T)),
##              B=1000)
## tree.data <- root(tree.data,
##                   outgroup=c("mtCR.out-1", "mtCR.out-2"))

plot <- anno %>%
  mutate(Group = factor(Group, c('NW.inner',
                                 'E.inner',
                                 'NW.outer',
                                 'E.outer',
                                 'NW.Oceanic',
                                 'NE.oceanic',
                                 'E.oceanic',
                                 'green',
                                 'blue',
                                 "New")),
         best.pop = factor(best.pop, c("mtCR.sound",
                                       "mtCR.inner",
                                       "mtCR.outer",
                                       "mtCR.ocean"))) %>%
  ggplot(aes(Group, ID)) +
  geom_tile(aes(fill=Group), na.rm=T) +
  geom_text(aes(label=count), na.rm=T) +
  facet_grid(cols=vars(Set), scales="free", space="free") +
  scale_fill_manual( values=c(colors[['2021']],
                              colors[['2017']],
                              New="grey")) +
  ggtitle("(C) Haplotype Members") +
  theme_bw() +
  theme(axis.title = element_blank(),
        axis.text.y= element_text(),
        axis.text.x= element_text(angle = 25, hjust = 1),
        legend.position = 'none')

plot.structure <- right_join(member.data,
                             data.frame(ID=tree.data$tip.label)) %>%
  ggplot(aes(q, ID, fill=(pop))) +
  geom_col(width=1) +
  scale_fill_iwanthue(breaks=pop.labels, name="") +
#  scale_y_discrete(expand = c(0,0)) +
  scale_x_continuous(labels=scales::label_percent(),
                     breaks = c(0.25, 0.5, 0.75), expand = c(0,0))+
  ggtitle("(B) Population Structure") +
  theme_bw()+
  theme( axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
    legend.position = c(0.48,1.015),
    legend.direction = "horizontal",
    legend.background = element_blank())

tree <- ggtree(tree.data) +
  geom_nodepoint(aes(color=as.numeric(label)), na.rm=T,
                 size=3) +
  geom_tiplab(align=T, as_ylab=T) +
  #geom_text2(aes(label=label, subset=!isTip),
  #           vjust=0.5, hjust=-0.1, size=3) +
  #geom_nodelab(aes(label=node), geom="label", bg="white") +
  #geom_tiplab(aes(label=node), bg="white") +
  ggtitle("(A) Neighboor-Joining Tree") +
  scale_x_continuous(breaks=seq(0.005, 0.05, 0.01)) +
  scale_color_viridis_c(name="Bootstrap Values") +
  guides(color=guide_colorbar(title.position="top")) +
  theme_tree2() +
  theme(legend.position = c(0.25,0.95),
        legend.direction = "horizontal")
tree <- flip(tree, 116, 104)
tree <- flip(tree, 106, 88)
tree <- flip(tree, 117, 132)
tree <- flip(tree, 87, 97)
tree <- flip(tree, 84, 89)
tree <- flip(tree, 82, 90)
#
main.plot <- plot.structure  %>%
  insert_left(tree, width=0.80)%>%
  insert_right(plot, width=0.90)
options("aplot_guides" = "keep")
main.plot
library(pegas)

pegas.haps <- haplotypes::as.DNAbin(as.dna(haplotypes.with_outgroup))
pegas.haps <- pegas.haps[c(-73, -74),]
class(pegas.haps) <- c("haplotype", "DNAbin")
attr(pegas.haps, "index") <- haplotypes.with_outgroup@hapind
attr(pegas.haps, "from") <- "haps"

pegas.haps.split <-
  select(haplotype.membership,
                            "Haplotype"=ID,
                            "Population"=best.pop) %>%
  distinct %>%
  split(., .$Population) %>%
  lapply(pull, "Haplotype") %>%
  lapply(function(x) {
    haps <- pegas.haps[x,]
    class(haps) <- c("haplotype", "DNAbin")
    attr(haps, "index") <- haplotypes.with_outgroup@hapind[x]
    attr(haps, "from") <- "haps"
    haps
  })
pegas.haps.split[['all']] <- pegas.haps


lapply(pegas.haps.split, pegas::hap.div, variance = T) %>%
  lapply(setNames, c('Hap. Diversity', 'var')) %>%
  bind_rows(.id='Population') %>%
  mutate(Population = fct_relevel(factor(Population),
                                  c("all", pop.labels))) %>%
  arrange(Population)
PopulationHap. Diversityvar
all0.808326618318560.00020120894594653
mtCR.sound0.756226923753150.000267128534545299
mtCR.inner0.9338235294117650.00149002681255929
mtCR.outer0.969696969696970.000440854139372
mtCR.ocean0.9779411764705880.000742592387824971
  1. Nucleotide Diversity (pi)
    lapply(pegas.haps.split, pegas::nuc.div, variance = T) %>%
      lapply(setNames, c('pi', 'var')) %>%
      bind_rows(.id='Population') %>%
      mutate(Population = fct_relevel(factor(Population),
                                      c("all", pop.labels))) %>%
      arrange(Population) %>%
      ascii(digits=c(0, 4, 3),
            format=c('s', 'f', 'e'),
            include.rownames = F)
    
        
    Populationpivar
    all0.01133.939e-05
    mtCR.sound0.00376.681e-06
    mtCR.inner0.00712.026e-05
    mtCR.outer0.00812.430e-05
    mtCR.ocean0.00872.818e-05
  1. Population differentiation (Fst)
    pop <- select(haplotype.membership, best.pop, ID) %>%
      unique() %>%
      pull(best.pop, name=ID) %>%
      na.omit()
    
    genind <- haplotypes.with_outgroup %>%
      as.dna %>%
      haplotypes::as.DNAbin(.) %>%
      .[names(pop),] %>%
      DNAbin2genind(., pop=pop)
    strata(genind) <- data.frame(pop)
    
    library(hierfstat)
    all <- wc(genind)
    pair <- pairwise.WCfst(genind) %>%
      as.data.frame %>%
      rownames_to_column("PopA") %>%
      gather(-PopA, key="PopB", value="Fst") %>%
      filter(PopA < PopB) %>%
      mutate(class = cut(Fst, breaks=c(0,0.05,0.15,0.25,1),
                         labels=c("little", "moderate", "great",
                                  "very great")))
    
    pair[which.min(pair$Fst),]
    pair[which.max(pair$Fst),]
    
    rbind(c('all', 'all', all$FST, NA, use.names = F), pair)
    
    
        

    Vollmer 2017

    gomx.pop <- filter(meta.counts, Set == '2017') %>%
      filter(!grepl("not submitted", ID)) %>%
      select(ID, Group,count) %>%
      arrange(count) %>%
      group_by(ID) %>%
      summarise(Group=Group[1]) %>%
      pull(Group, name=ID)
    
    gomx.genind <-
      fasta.with_outgroup.trimmed.fixed[names(gomx.pop),] %>%
      DNAbin2genind(., pop=gomx.pop)
    strata(gomx.genind) <- data.frame(gomx.pop)
    
    all <- wc(gomx.genind)
    pair <- pairwise.WCfst(gomx.genind) %>%
      as.data.frame %>%
      rownames_to_column("PopA") %>%
      gather(-PopA, key="PopB", value="Fst") %>%
      filter(PopA < PopB) %>%
      mutate(class = cut(Fst, breaks=c(0,0.05,0.15,0.25,1),
                         labels=c("little", "moderate", "great",
                                  "very great"))) %>%
      arrange(Fst)
    
    rbind(c('all', 'all', all$FST, NA, use.names = F), pair)
    
    
        

  2. PhiST
    • Global
      ddp <- as.genclone(genind)
      phistp <- poppr.amova(ddp, ~pop, filter=T, threshold=0.1)
      phistp
              

  • Per Population
    set.seed(45243)
    ## pop[pop == '2'] <- paste0( pop[pop == '2'],
    ##          letters[sample.int(4, size = sum(pop==2), replace = T)])
    pop <- factor(pop)
    table(pop)
    
    ddp <- as.genclone(genind)
    
    phist.subset <- function(pop1, pop2) {
      pops <- popsub(ddp, sublist = c(pop1, pop2), drop=T)
      gc <- as.genclone(pops)
      phistsp <- poppr.amova(gc, ~pop, filter=T, threshold=0.1)
      return(unlist(phistsp$statphi))
    }
    
    phist.table <- expand_grid(pop1=levels(pop),
                  pop2=levels(pop)) %>%
      rowwise() %>%
      mutate(phist = phist.subset(pop1, pop2)) %>%
      ungroup
        

phist.table %>%
  filter(pop1 != pop2) %>%
  group_by(pop1, pop2) %>%
  summarise( phist.all = paste(sprintf("%0.4f", phist),
                               collapse = " "),
            phist=mean(phist), .groups="drop") %>%
  arrange(phist.all) %>%
  ascii(digits=4)
pop1pop2phist.allphist
1mtCR.innermtCR.outer0.42390.4239
2mtCR.outermtCR.inner0.42390.4239
3mtCR.innermtCR.sound0.44020.4402
4mtCR.soundmtCR.inner0.44020.4402
5mtCR.oceanmtCR.outer0.58780.5878
6mtCR.outermtCR.ocean0.58780.5878
7mtCR.outermtCR.sound0.64340.6434
8mtCR.soundmtCR.outer0.64340.6434
9mtCR.innermtCR.ocean0.65340.6534
10mtCR.oceanmtCR.inner0.65340.6534
11mtCR.oceanmtCR.sound0.74420.7442
12mtCR.soundmtCR.ocean0.74420.7442
  • Private alleles
private.alleles <- private_alleles(genind) %>%
  t %>%
  as.data.frame %>%
  rownames_to_column("POS")
ascii(private.alleles, digits=0)

colSums(private.alleles > 0)
POSmtCR.soundmtCR.innermtCR.outermtCR.ocean
123.g0001
226.g1000
332.c00015
433.t0100
564.a0200
673.c2000
7101.g0100
8104.t1000
9106.t1000
10110.c00015
11116.t0100
12117.c0001
13123.c1000
14213.g0100
15216.c0100
16235.c0020
17249.a0010
18252.t0002
19265.c0001
20273.g2000
21274.c0020
22278.c1000
23283.g1000
24291.g0100
25294.c1000
26295.t0010
27306.c0010

POS mtCR.sound mtCR.inner mtCR.outer mtCR.ocean 27 9 7 5 6