Skip to content

Commit

Permalink
added first draft of interactive subcommands
Browse files Browse the repository at this point in the history
  • Loading branch information
mgiulini committed Sep 26, 2023
1 parent 9af0ea2 commit bedbf25
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 0 deletions.
158 changes: 158 additions & 0 deletions src/haddock/re/clustfcc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import argparse

import numpy as np
import json

from haddock import log
from pathlib import Path

from haddock.gear.config import load as read_config
from fcc.scripts import calc_fcc_matrix, cluster_fcc

from haddock.libs.libontology import ModuleIO

from haddock.libs.libclust import write_structure_list


def add_clustfcc_arguments(clustfcc_subcommand):
    """Register the clustfcc reclustering options on the given subparser.

    Parameters
    ----------
    clustfcc_subcommand : argparse.ArgumentParser
        The subcommand parser to extend.

    Returns
    -------
    argparse.ArgumentParser
        The same parser, with the clustfcc arguments attached.
    """
    # the directory to recluster is the only positional argument
    clustfcc_subcommand.add_argument(
        "clustfcc_dir",
        help="The clustfcc directory to recluster.",
        )

    # optional overrides of the previous clustering parameters:
    # (short flag, long flag, type, help text)
    optional_specs = (
        ("-f", "--fraction", float,
         "fraction of common contacts to not be considered a singleton model."),
        ("-s", "--strictness", float,
         "fraction of common contacts to be considered to be part of the same cluster."),
        ("-t", "--threshold", int,
         "cluster population threshold."),
        )
    for short_flag, long_flag, arg_type, help_text in optional_specs:
        clustfcc_subcommand.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            required=False,
            type=arg_type,
            )

    return clustfcc_subcommand

def reclustfcc(clustfcc_dir, fraction=None, strictness=None, threshold=None):
    """Recluster the models in the clustfcc directory.

    Parameters
    ----------
    clustfcc_dir : str or Path
        Path to the original clustfcc step directory.
    fraction : float, optional
        New fraction-of-common-contacts cutoff; overrides the value used
        by the original run.
    strictness : float, optional
        New strictness value; overrides the value used by the original run.
    threshold : int, optional
        New cluster population threshold; overrides the value used by the
        original run.

    Returns
    -------
    outdir : Path
        The newly created ``<clustfcc_dir>_interactive`` directory holding
        the reclustering output.
    """
    log.info(f"Reclustering {clustfcc_dir}")

    # create the interactive folder next to the original step directory
    run_dir = Path(clustfcc_dir).parent
    clustfcc_name = Path(clustfcc_dir).name
    outdir = Path(run_dir, f"{clustfcc_name}_interactive")
    outdir.mkdir(exist_ok=True)

    # recover the input models from the step's io.json
    io = ModuleIO()
    filename = Path(clustfcc_dir, "io.json")
    io.load(filename)
    models = io.input

    # load the original clustering parameters via json
    clustfcc_params = read_config(Path(clustfcc_dir, "params.cfg"))
    key = list(clustfcc_params['final_cfg'].keys())[0]
    clustfcc_params = clustfcc_params['final_cfg'][key]
    log.info(f"Previous clustering parameters: {clustfcc_params}")

    # adjust the parameters with the user-supplied overrides.
    # BUGFIX: the fraction override must target 'fraction_cutoff', which is
    # the key actually consumed by cluster_fcc.read_matrix below; writing it
    # to a 'fraction' key made the -f/--fraction option a no-op.
    if fraction is not None:
        clustfcc_params["fraction_cutoff"] = fraction
    if strictness is not None:
        clustfcc_params["strictness"] = strictness
    if threshold is not None:
        clustfcc_params["threshold"] = threshold

    # load the fcc matrix
    pool = cluster_fcc.read_matrix(
        Path(clustfcc_dir, "fcc.matrix"),
        clustfcc_params['fraction_cutoff'],
        clustfcc_params['strictness'],
        )

    # initialise so the `if clusters:` check below cannot raise NameError
    # when the threshold range is empty (e.g. threshold <= 0)
    clusters = []
    cluster_check = False
    while not cluster_check:
        # relax the population threshold until at least one cluster is found
        for threshold in range(clustfcc_params['threshold'], 0, -1):
            log.info(f'Clustering with threshold={threshold}')
            _, clusters = cluster_fcc.cluster_elements(
                pool,
                threshold=threshold,
                )
            if not clusters:
                log.info(
                    "[WARNING] No cluster was found, decreasing threshold!"
                    )
            else:
                cluster_check = True
                # pass the actual threshold back to the param dict
                # because it will be use in the detailed output
                clustfcc_params['threshold'] = threshold
                break
        if not cluster_check:
            # No cluster was obtained in any threshold
            cluster_check = True

    # Prepare output and read the elements
    clt_dic = {}
    if clusters:
        # write the classic output file for compatibility reasons.
        # BUGFIX: write it inside outdir (as reclustrmsd does) instead of
        # into the current working directory.
        log.info('Saving output to cluster.out')
        cluster_out = Path(outdir, 'cluster.out')
        with open(cluster_out, 'w') as fh:
            cluster_fcc.output_clusters(fh, clusters)
        clt_centers = {}
        for clt in clusters:
            cluster_id = clt.name
            # cluster element names are 1-based; models is 0-based
            cluster_center_id = clt.center.name - 1
            cluster_center_pdb = models[cluster_center_id]
            clt_dic[cluster_id] = []
            clt_centers[cluster_id] = cluster_center_pdb
            clt_dic[cluster_id].append(cluster_center_pdb)
            for model in clt.members:
                model_id = model.name
                model_pdb = models[model_id - 1]
                clt_dic[cluster_id].append(model_pdb)
        # Rank the clusters
        # they are sorted by the topX (threshold) models in each cluster
        score_dic = {}
        for clt_id in clt_dic:
            score_l = [p.score for p in clt_dic[clt_id]]
            score_l.sort()
            # average over at most `threshold` best-scoring models
            denom = float(min(threshold, len(score_l)))
            top4_score = sum(score_l[:threshold]) / denom
            score_dic[clt_id] = top4_score
        sorted_score_dic = sorted(score_dic.items(), key=lambda k: k[1])
        # Add this info to the models
        output_models = []
        for cluster_rank, _e in enumerate(sorted_score_dic, start=1):
            cluster_id, _ = _e
            # sort the models by score
            clt_dic[cluster_id].sort()
            # rank the models
            for model_ranking, pdb in enumerate(clt_dic[cluster_id],
                                                start=1):
                pdb.clt_id = cluster_id
                pdb.clt_rank = cluster_rank
                pdb.clt_model_rank = model_ranking
                output_models.append(pdb)
        # Write unclustered structures
        write_structure_list(models,
                             output_models,
                             out_fname=Path(outdir, "clustfcc.tsv"))

    return outdir

122 changes: 122 additions & 0 deletions src/haddock/re/clustrmsd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import argparse
import numpy as np
import json

from haddock import log
from pathlib import Path

from haddock.gear.config import load as read_config

from haddock.libs.libontology import ModuleIO

from haddock.modules.analysis.clustrmsd.clustrmsd import get_clusters, rank_clusters, write_clusters
from haddock.libs.libclust import write_structure_list



def add_clustrmsd_arguments(clustrmsd_subcommand):
    """Register the clustrmsd reclustering options on the given subparser.

    Parameters
    ----------
    clustrmsd_subcommand : argparse.ArgumentParser
        The subcommand parser to extend.

    Returns
    -------
    argparse.ArgumentParser
        The same parser, with the clustrmsd arguments attached.
    """
    # the directory to recluster is the only positional argument
    clustrmsd_subcommand.add_argument(
        "clustrmsd_dir",
        help="The clustrmsd directory to recluster.",
        )

    # optional overrides of the previous clustering parameters:
    # (short flag, long flag, type, help text)
    optional_specs = (
        ("-n", "--n_clusters", int, "number of clusters to generate."),
        ("-d", "--distance", int, "cutoff distance."),
        ("-t", "--threshold", int, "cluster population threshold."),
        )
    for short_flag, long_flag, arg_type, help_text in optional_specs:
        clustrmsd_subcommand.add_argument(
            short_flag,
            long_flag,
            help=help_text,
            required=False,
            type=arg_type,
            )

    return clustrmsd_subcommand

def reclustrmsd(clustrmsd_dir, n_clusters=None, distance=None, threshold=None):
    """Recluster the models in the clustrmsd directory.

    Parameters
    ----------
    clustrmsd_dir : str or Path
        Path to the original clustrmsd step directory.
    n_clusters : int, optional
        Number of clusters to generate; takes precedence over *distance*.
    distance : int, optional
        Cutoff distance; used only when *n_clusters* is not given.
    threshold : int, optional
        New cluster population threshold.

    Returns
    -------
    outdir : Path
        The newly created ``<clustrmsd_dir>_interactive`` directory holding
        the reclustering output.
    """
    log.info(f"Reclustering {clustrmsd_dir}")

    # read back the parameters used by the original clustering step
    cfg = read_config(Path(clustrmsd_dir, "params.cfg"))
    first_key = next(iter(cfg['final_cfg']))
    clustrmsd_params = cfg['final_cfg'][first_key]
    log.info(f"Previous clustering parameters: {clustrmsd_params}")

    # apply the user overrides; n_clusters wins over distance when both given
    if n_clusters is not None:
        clustrmsd_params["tolerance"] = n_clusters
        clustrmsd_params["criterion"] = "maxclust"
    elif distance is not None:
        clustrmsd_params["tolerance"] = distance
        clustrmsd_params["criterion"] = "distance"

    if threshold is not None:
        clustrmsd_params["threshold"] = threshold

    # redo the clustering on the stored dendrogram
    dendrogram = np.loadtxt(Path(clustrmsd_dir, "dendrogram.txt"))
    cluster_arr = get_clusters(
        dendrogram,
        clustrmsd_params["tolerance"],
        clustrmsd_params["criterion"],
        )
    log.info(f"clusters {cluster_arr}")

    # create the interactive folder next to the original step directory
    base_dir = Path(clustrmsd_dir).parent
    step_name = Path(clustrmsd_dir).name
    outdir = Path(base_dir, f"{step_name}_interactive")
    outdir.mkdir(exist_ok=True)

    # recover the models handled by the original step from its io.json
    io = ModuleIO()
    io.load(Path(clustrmsd_dir, "io.json"))
    models = io.retrieve_models()

    # drop the unclustered bin (-1) from the unique cluster labels
    clusters = [c for c in np.unique(cluster_arr) if c != -1]
    log.info(f"clusters = {clusters}")

    clt_dic, cluster_centers = write_clusters(
        clusters,
        cluster_arr,
        models,
        out_filename=Path(outdir, "cluster.out"),
        rmsd_matrix=None,
        centers=False,
        )

    sorted_score_dic = rank_clusters(clt_dic, clustrmsd_params["threshold"])

    # annotate each model with its cluster id, cluster rank and
    # intra-cluster rank
    output_models = []
    for cluster_rank, (cluster_id, _score) in enumerate(sorted_score_dic,
                                                        start=1):
        cluster_models = clt_dic[cluster_id]
        # sort the models by score
        cluster_models.sort()
        for model_ranking, pdb in enumerate(cluster_models, start=1):
            pdb.clt_id = int(cluster_id)
            pdb.clt_rank = cluster_rank
            pdb.clt_model_rank = model_ranking
            output_models.append(pdb)

    write_structure_list(
        models,
        output_models,
        out_fname=Path(outdir, "clustrmsd.tsv"),
        )
    # save the io.json file so downstream tools can read this folder
    io.save(outdir)

    return outdir



Loading

0 comments on commit bedbf25

Please sign in to comment.