Skip to content

Commit

Permalink
added first draft of interactive subcommands
Browse files Browse the repository at this point in the history
  • Loading branch information
mgiulini committed Sep 26, 2023
1 parent 9af0ea2 commit bedbf25
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 0 deletions.
158 changes: 158 additions & 0 deletions src/haddock/re/clustfcc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import argparse

import numpy as np
import json

from haddock import log
from pathlib import Path

from haddock.gear.config import load as read_config
from fcc.scripts import calc_fcc_matrix, cluster_fcc

from haddock.libs.libontology import ModuleIO

from haddock.libs.libclust import write_structure_list


def add_clustfcc_arguments(clustfcc_subcommand):
    """Register the clustfcc reclustering options on the given subparser.

    Parameters
    ----------
    clustfcc_subcommand : argparse.ArgumentParser
        The subcommand parser to extend.

    Returns
    -------
    argparse.ArgumentParser
        The same parser, with the clustfcc arguments attached.
    """
    # the directory to recluster is the only positional argument
    clustfcc_subcommand.add_argument(
        "clustfcc_dir",
        help="The clustfcc directory to recluster.",
        )

    # optional overrides of the previous clustering parameters:
    # (short flag, long flag, type, help text)
    optional_specs = (
        ("-f", "--fraction", float,
         "fraction of common contacts to not be considered a singleton model."),
        ("-s", "--strictness", float,
         "fraction of common contacts to be considered to be part of the same cluster."),
        ("-t", "--threshold", int,
         "cluster population threshold."),
        )
    for short_flag, long_flag, arg_type, help_text in optional_specs:
        clustfcc_subcommand.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            required=False,
            type=arg_type,
            )

    return clustfcc_subcommand

def reclustfcc(clustfcc_dir, fraction=None, strictness=None, threshold=None):
    """Recluster the models in the clustfcc directory.

    Parameters
    ----------
    clustfcc_dir : str or Path
        Path to the original clustfcc step directory.
    fraction : float, optional
        New fraction-of-common-contacts cutoff; overrides the value used
        by the original run.
    strictness : float, optional
        New strictness value; overrides the value used by the original run.
    threshold : int, optional
        New cluster population threshold; overrides the value used by the
        original run.

    Returns
    -------
    outdir : Path
        The newly created ``<clustfcc_dir>_interactive`` directory holding
        the reclustering output.
    """
    log.info(f"Reclustering {clustfcc_dir}")

    # create the interactive folder next to the original step directory
    run_dir = Path(clustfcc_dir).parent
    clustfcc_name = Path(clustfcc_dir).name
    outdir = Path(run_dir, f"{clustfcc_name}_interactive")
    outdir.mkdir(exist_ok=True)

    # recover the input models from the step's io.json
    io = ModuleIO()
    filename = Path(clustfcc_dir, "io.json")
    io.load(filename)
    models = io.input

    # load the original clustering parameters via json
    clustfcc_params = read_config(Path(clustfcc_dir, "params.cfg"))
    key = list(clustfcc_params['final_cfg'].keys())[0]
    clustfcc_params = clustfcc_params['final_cfg'][key]
    log.info(f"Previous clustering parameters: {clustfcc_params}")

    # adjust the parameters with the user-supplied overrides.
    # BUGFIX: the fraction override must target 'fraction_cutoff', which is
    # the key actually consumed by cluster_fcc.read_matrix below; writing it
    # to a 'fraction' key made the -f/--fraction option a no-op.
    if fraction is not None:
        clustfcc_params["fraction_cutoff"] = fraction
    if strictness is not None:
        clustfcc_params["strictness"] = strictness
    if threshold is not None:
        clustfcc_params["threshold"] = threshold

    # load the fcc matrix
    pool = cluster_fcc.read_matrix(
        Path(clustfcc_dir, "fcc.matrix"),
        clustfcc_params['fraction_cutoff'],
        clustfcc_params['strictness'],
        )

    # initialise so the `if clusters:` check below cannot raise NameError
    # when the threshold range is empty (e.g. threshold <= 0)
    clusters = []
    cluster_check = False
    while not cluster_check:
        # relax the population threshold until at least one cluster is found
        for threshold in range(clustfcc_params['threshold'], 0, -1):
            log.info(f'Clustering with threshold={threshold}')
            _, clusters = cluster_fcc.cluster_elements(
                pool,
                threshold=threshold,
                )
            if not clusters:
                log.info(
                    "[WARNING] No cluster was found, decreasing threshold!"
                    )
            else:
                cluster_check = True
                # pass the actual threshold back to the param dict
                # because it will be use in the detailed output
                clustfcc_params['threshold'] = threshold
                break
        if not cluster_check:
            # No cluster was obtained in any threshold
            cluster_check = True

    # Prepare output and read the elements
    clt_dic = {}
    if clusters:
        # write the classic output file for compatibility reasons.
        # BUGFIX: write it inside outdir (as reclustrmsd does) instead of
        # into the current working directory.
        log.info('Saving output to cluster.out')
        cluster_out = Path(outdir, 'cluster.out')
        with open(cluster_out, 'w') as fh:
            cluster_fcc.output_clusters(fh, clusters)
        clt_centers = {}
        for clt in clusters:
            cluster_id = clt.name
            # cluster element names are 1-based; models is 0-based
            cluster_center_id = clt.center.name - 1
            cluster_center_pdb = models[cluster_center_id]
            clt_dic[cluster_id] = []
            clt_centers[cluster_id] = cluster_center_pdb
            clt_dic[cluster_id].append(cluster_center_pdb)
            for model in clt.members:
                model_id = model.name
                model_pdb = models[model_id - 1]
                clt_dic[cluster_id].append(model_pdb)
        # Rank the clusters
        # they are sorted by the topX (threshold) models in each cluster
        score_dic = {}
        for clt_id in clt_dic:
            score_l = [p.score for p in clt_dic[clt_id]]
            score_l.sort()
            # average over at most `threshold` best-scoring models
            denom = float(min(threshold, len(score_l)))
            top4_score = sum(score_l[:threshold]) / denom
            score_dic[clt_id] = top4_score
        sorted_score_dic = sorted(score_dic.items(), key=lambda k: k[1])
        # Add this info to the models
        output_models = []
        for cluster_rank, _e in enumerate(sorted_score_dic, start=1):
            cluster_id, _ = _e
            # sort the models by score
            clt_dic[cluster_id].sort()
            # rank the models
            for model_ranking, pdb in enumerate(clt_dic[cluster_id],
                                                start=1):
                pdb.clt_id = cluster_id
                pdb.clt_rank = cluster_rank
                pdb.clt_model_rank = model_ranking
                output_models.append(pdb)
        # Write unclustered structures
        write_structure_list(models,
                             output_models,
                             out_fname=Path(outdir, "clustfcc.tsv"))

    return outdir

122 changes: 122 additions & 0 deletions src/haddock/re/clustrmsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import argparse
import numpy as np
import json

from haddock import log
from pathlib import Path

from haddock.gear.config import load as read_config

from haddock.libs.libontology import ModuleIO

from haddock.modules.analysis.clustrmsd.clustrmsd import get_clusters, rank_clusters, write_clusters
from haddock.libs.libclust import write_structure_list



def add_clustrmsd_arguments(clustrmsd_subcommand):
    """Register the clustrmsd reclustering options on the given subparser.

    Parameters
    ----------
    clustrmsd_subcommand : argparse.ArgumentParser
        The subcommand parser to extend.

    Returns
    -------
    argparse.ArgumentParser
        The same parser, with the clustrmsd arguments attached.
    """
    # the directory to recluster is the only positional argument
    clustrmsd_subcommand.add_argument(
        "clustrmsd_dir",
        help="The clustrmsd directory to recluster.",
        )

    # optional overrides of the previous clustering parameters:
    # (short flag, long flag, type, help text)
    optional_specs = (
        ("-n", "--n_clusters", int, "number of clusters to generate."),
        ("-d", "--distance", int, "cutoff distance."),
        ("-t", "--threshold", int, "cluster population threshold."),
        )
    for short_flag, long_flag, arg_type, help_text in optional_specs:
        clustrmsd_subcommand.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            required=False,
            type=arg_type,
            )

    return clustrmsd_subcommand

def reclustrmsd(clustrmsd_dir, n_clusters=None, distance=None, threshold=None):
    """Recluster the models in the clustrmsd directory.

    Parameters
    ----------
    clustrmsd_dir : str or Path
        Path to the original clustrmsd step directory.
    n_clusters : int, optional
        Number of clusters to generate; takes precedence over *distance*.
    distance : int, optional
        Cutoff distance; used only when *n_clusters* is not given.
    threshold : int, optional
        New cluster population threshold.

    Returns
    -------
    outdir : Path
        The newly created ``<clustrmsd_dir>_interactive`` directory holding
        the reclustering output.
    """
    log.info(f"Reclustering {clustrmsd_dir}")

    # read back the parameters used by the original clustering step
    cfg = read_config(Path(clustrmsd_dir, "params.cfg"))
    first_key = next(iter(cfg['final_cfg']))
    clustrmsd_params = cfg['final_cfg'][first_key]
    log.info(f"Previous clustering parameters: {clustrmsd_params}")

    # apply the user overrides; n_clusters wins over distance when both given
    if n_clusters is not None:
        clustrmsd_params["tolerance"] = n_clusters
        clustrmsd_params["criterion"] = "maxclust"
    elif distance is not None:
        clustrmsd_params["tolerance"] = distance
        clustrmsd_params["criterion"] = "distance"

    if threshold is not None:
        clustrmsd_params["threshold"] = threshold

    # redo the clustering on the stored dendrogram
    dendrogram = np.loadtxt(Path(clustrmsd_dir, "dendrogram.txt"))
    cluster_arr = get_clusters(
        dendrogram,
        clustrmsd_params["tolerance"],
        clustrmsd_params["criterion"],
        )
    log.info(f"clusters {cluster_arr}")

    # create the interactive folder next to the original step directory
    base_dir = Path(clustrmsd_dir).parent
    step_name = Path(clustrmsd_dir).name
    outdir = Path(base_dir, f"{step_name}_interactive")
    outdir.mkdir(exist_ok=True)

    # recover the models handled by the original step from its io.json
    io = ModuleIO()
    io.load(Path(clustrmsd_dir, "io.json"))
    models = io.retrieve_models()

    # drop the unclustered bin (-1) from the unique cluster labels
    clusters = [c for c in np.unique(cluster_arr) if c != -1]
    log.info(f"clusters = {clusters}")

    clt_dic, cluster_centers = write_clusters(
        clusters,
        cluster_arr,
        models,
        out_filename=Path(outdir, "cluster.out"),
        rmsd_matrix=None,
        centers=False,
        )

    sorted_score_dic = rank_clusters(clt_dic, clustrmsd_params["threshold"])

    # annotate each model with its cluster id, cluster rank and
    # intra-cluster rank
    output_models = []
    for cluster_rank, (cluster_id, _score) in enumerate(sorted_score_dic,
                                                        start=1):
        cluster_models = clt_dic[cluster_id]
        # sort the models by score
        cluster_models.sort()
        for model_ranking, pdb in enumerate(cluster_models, start=1):
            pdb.clt_id = int(cluster_id)
            pdb.clt_rank = cluster_rank
            pdb.clt_model_rank = model_ranking
            output_models.append(pdb)

    write_structure_list(
        models,
        output_models,
        out_fname=Path(outdir, "clustrmsd.tsv"),
        )
    # save the io.json file so downstream tools can read this folder
    io.save(outdir)

    return outdir



Loading

0 comments on commit bedbf25

Please sign in to comment.