Simon Duerr committed on
Commit
f969e9c
1 Parent(s): fb853ff

new ui, preps for af2

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. alphafold/LICENSE +202 -0
  3. alphafold/alphafold/__init__.py +14 -0
  4. alphafold/alphafold/__pycache__/__init__.cpython-36.pyc +0 -0
  5. alphafold/alphafold/__pycache__/__init__.cpython-38.pyc +0 -0
  6. alphafold/alphafold/common/__init__.py +14 -0
  7. alphafold/alphafold/common/__pycache__/__init__.cpython-36.pyc +0 -0
  8. alphafold/alphafold/common/__pycache__/__init__.cpython-38.pyc +0 -0
  9. alphafold/alphafold/common/__pycache__/confidence.cpython-36.pyc +0 -0
  10. alphafold/alphafold/common/__pycache__/confidence.cpython-38.pyc +0 -0
  11. alphafold/alphafold/common/__pycache__/protein.cpython-36.pyc +0 -0
  12. alphafold/alphafold/common/__pycache__/protein.cpython-38.pyc +0 -0
  13. alphafold/alphafold/common/__pycache__/residue_constants.cpython-36.pyc +0 -0
  14. alphafold/alphafold/common/__pycache__/residue_constants.cpython-38.pyc +0 -0
  15. alphafold/alphafold/common/confidence.py +155 -0
  16. alphafold/alphafold/common/protein.py +229 -0
  17. alphafold/alphafold/common/protein_test.py +89 -0
  18. alphafold/alphafold/common/residue_constants.py +895 -0
  19. alphafold/alphafold/common/residue_constants_test.py +190 -0
  20. alphafold/alphafold/common/testdata/2rbg.pdb +0 -0
  21. alphafold/alphafold/data/__init__.py +14 -0
  22. alphafold/alphafold/data/__pycache__/__init__.cpython-36.pyc +0 -0
  23. alphafold/alphafold/data/__pycache__/__init__.cpython-38.pyc +0 -0
  24. alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-36.pyc +0 -0
  25. alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-38.pyc +0 -0
  26. alphafold/alphafold/data/__pycache__/parsers.cpython-36.pyc +0 -0
  27. alphafold/alphafold/data/__pycache__/parsers.cpython-38.pyc +0 -0
  28. alphafold/alphafold/data/__pycache__/pipeline.cpython-36.pyc +0 -0
  29. alphafold/alphafold/data/__pycache__/pipeline.cpython-38.pyc +0 -0
  30. alphafold/alphafold/data/__pycache__/templates.cpython-36.pyc +0 -0
  31. alphafold/alphafold/data/__pycache__/templates.cpython-38.pyc +0 -0
  32. alphafold/alphafold/data/mmcif_parsing.py +384 -0
  33. alphafold/alphafold/data/parsers.py +364 -0
  34. alphafold/alphafold/data/pipeline.py +209 -0
  35. alphafold/alphafold/data/templates.py +922 -0
  36. alphafold/alphafold/data/tools/__init__.py +14 -0
  37. alphafold/alphafold/data/tools/__pycache__/__init__.cpython-36.pyc +0 -0
  38. alphafold/alphafold/data/tools/__pycache__/__init__.cpython-38.pyc +0 -0
  39. alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-36.pyc +0 -0
  40. alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-38.pyc +0 -0
  41. alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-36.pyc +0 -0
  42. alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-38.pyc +0 -0
  43. alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-36.pyc +0 -0
  44. alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-38.pyc +0 -0
  45. alphafold/alphafold/data/tools/__pycache__/kalign.cpython-36.pyc +0 -0
  46. alphafold/alphafold/data/tools/__pycache__/kalign.cpython-38.pyc +0 -0
  47. alphafold/alphafold/data/tools/__pycache__/utils.cpython-36.pyc +0 -0
  48. alphafold/alphafold/data/tools/__pycache__/utils.cpython-38.pyc +0 -0
  49. alphafold/alphafold/data/tools/hhblits.py +155 -0
  50. alphafold/alphafold/data/tools/hhsearch.py +91 -0
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ npz filter=lfs diff=lfs merge=lfs -text
29
+ *.npz filter=lfs diff=lfs merge=lfs -text
alphafold/LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
alphafold/alphafold/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """An implementation of the inference pipeline of AlphaFold v2.0."""
alphafold/alphafold/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (215 Bytes). View file
 
alphafold/alphafold/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (232 Bytes). View file
 
alphafold/alphafold/common/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Common data types and constants used within Alphafold."""
alphafold/alphafold/common/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (214 Bytes). View file
 
alphafold/alphafold/common/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (231 Bytes). View file
 
alphafold/alphafold/common/__pycache__/confidence.cpython-36.pyc ADDED
Binary file (4.07 kB). View file
 
alphafold/alphafold/common/__pycache__/confidence.cpython-38.pyc ADDED
Binary file (4.08 kB). View file
 
alphafold/alphafold/common/__pycache__/protein.cpython-36.pyc ADDED
Binary file (5.41 kB). View file
 
alphafold/alphafold/common/__pycache__/protein.cpython-38.pyc ADDED
Binary file (5.48 kB). View file
 
alphafold/alphafold/common/__pycache__/residue_constants.cpython-36.pyc ADDED
Binary file (23.9 kB). View file
 
alphafold/alphafold/common/__pycache__/residue_constants.cpython-38.pyc ADDED
Binary file (20.2 kB). View file
 
alphafold/alphafold/common/confidence.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for processing confidence metrics."""
16
+
17
+ from typing import Dict, Optional, Tuple
18
+ import numpy as np
19
+ import scipy.special
20
+
21
+
22
def compute_plddt(logits: np.ndarray) -> np.ndarray:
  """Computes per-residue pLDDT from logits.

  The logits are turned into a probability distribution over equally wide
  bins covering [0, 1]; the result is the expectation of the bin centers under
  that distribution, rescaled to the 0-100 range.

  Args:
    logits: [num_res, num_bins] output from the PredictedLDDTHead.

  Returns:
    plddt: [num_res] per-residue pLDDT.
  """
  num_bins = logits.shape[-1]
  bin_width = 1.0 / num_bins
  # Centers of num_bins equal-width bins on [0, 1]: w/2, 3w/2, ..., 1 - w/2.
  bin_centers = np.arange(start=0.5 * bin_width, stop=1.0, step=bin_width)
  probs = scipy.special.softmax(logits, axis=-1)
  # Expected bin center per residue.
  predicted_lddt_ca = np.sum(probs * bin_centers[None, :], axis=-1)
  return predicted_lddt_ca * 100
37
+
38
+
39
def _calculate_bin_centers(breaks: np.ndarray) -> np.ndarray:
  """Gets the bin centers from the bin edges.

  The spacing is taken from the first two edges and applied to every edge, so
  the edges are treated as uniformly spaced and at least two are required.

  Args:
    breaks: [num_bins - 1] the error bin edges.

  Returns:
    bin_centers: [num_bins] the error bin centers.
  """
  step = breaks[1] - breaks[0]

  # Add half-step to get the center.
  bin_centers = breaks + step / 2
  # Add a catch-all bin at the end, one full step past the last center.
  bin_centers = np.concatenate([bin_centers, [bin_centers[-1] + step]],
                               axis=0)
  return bin_centers
56
+
57
+
58
def _calculate_expected_aligned_error(
    alignment_confidence_breaks: np.ndarray,
    aligned_distance_error_probs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
  """Calculates expected aligned distance errors for every pair of residues.

  Args:
    alignment_confidence_breaks: [num_bins - 1] the error bin edges.
    aligned_distance_error_probs: [num_res, num_res, num_bins] the predicted
      probs for each error bin, for each pair of residues.

  Returns:
    predicted_aligned_error: [num_res, num_res] the expected aligned distance
      error for each pair of residues.
    max_predicted_aligned_error: The maximum predicted error possible, i.e.
      the center of the final (catch-all) bin.
  """
  bin_centers = _calculate_bin_centers(alignment_confidence_breaks)

  # Tuple of expected aligned distance error and max possible error.
  return (np.sum(aligned_distance_error_probs * bin_centers, axis=-1),
          np.asarray(bin_centers[-1]))
78
+
79
+
80
def compute_predicted_aligned_error(
    logits: np.ndarray,
    breaks: np.ndarray) -> Dict[str, np.ndarray]:
  """Computes aligned confidence metrics from logits.

  Args:
    logits: [num_res, num_res, num_bins] the logits output from
      PredictedAlignedErrorHead.
    breaks: [num_bins - 1] the error bin edges.

  Returns:
    A dictionary with the following entries:
      aligned_confidence_probs: [num_res, num_res, num_bins] the predicted
        aligned error probabilities over bins for each residue pair.
      predicted_aligned_error: [num_res, num_res] the expected aligned distance
        error for each pair of residues.
      max_predicted_aligned_error: The maximum predicted error possible.
  """
  # Softmax over the bin axis turns logits into per-pair bin probabilities.
  aligned_confidence_probs = scipy.special.softmax(logits, axis=-1)
  predicted_aligned_error, max_predicted_aligned_error = (
      _calculate_expected_aligned_error(
          alignment_confidence_breaks=breaks,
          aligned_distance_error_probs=aligned_confidence_probs))
  return {
      'aligned_confidence_probs': aligned_confidence_probs,
      'predicted_aligned_error': predicted_aligned_error,
      'max_predicted_aligned_error': max_predicted_aligned_error,
  }
109
+
110
+
111
def predicted_tm_score(
    logits: np.ndarray,
    breaks: np.ndarray,
    residue_weights: Optional[np.ndarray] = None) -> np.ndarray:
  """Computes predicted TM alignment score.

  Args:
    logits: [num_res, num_res, num_bins] the logits output from
      PredictedAlignedErrorHead.
    breaks: [num_bins - 1] the error bin edges.
    residue_weights: [num_res] the per residue weights to use for the
      expectation. Defaults to a weight of 1 for every residue.

  Returns:
    ptm_score: the predicted TM alignment score, as a 0-d array.
  """

  # residue_weights has to be in [0, 1], but can be floating-point, i.e. the
  # exp. resolved head's probability.
  if residue_weights is None:
    residue_weights = np.ones(logits.shape[0])

  bin_centers = _calculate_bin_centers(breaks)

  num_res = np.sum(residue_weights)
  # Clip num_res to avoid negative/undefined d0.
  clipped_num_res = max(num_res, 19)

  # Compute d_0(num_res) as defined by TM-score, eqn. (5) in
  # http://zhanglab.ccmb.med.umich.edu/papers/2004_3.pdf
  # Zhang & Skolnick "Scoring function for automated
  # assessment of protein structure template quality" 2004
  d0 = 1.24 * (clipped_num_res - 15) ** (1./3) - 1.8

  # Convert logits to probs.
  probs = scipy.special.softmax(logits, axis=-1)

  # TM-Score term for every bin.
  tm_per_bin = 1. / (1 + np.square(bin_centers) / np.square(d0))
  # E_distances tm(distance).
  predicted_tm_term = np.sum(probs * tm_per_bin, axis=-1)

  # Weighted average over the second residue axis; the epsilon guards against
  # division by zero when all weights are zero.
  normed_residue_mask = residue_weights / (1e-8 + residue_weights.sum())
  per_alignment = np.sum(predicted_tm_term * normed_residue_mask, axis=-1)
  # Report the score of the row with the highest weighted value.
  return np.asarray(per_alignment[(per_alignment * residue_weights).argmax()])
alphafold/alphafold/common/protein.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Protein data type."""
16
+ import dataclasses
17
+ import io
18
+ from typing import Any, Mapping, Optional
19
+ from alphafold.common import residue_constants
20
+ from Bio.PDB import PDBParser
21
+ import numpy as np
22
+
23
# Type aliases used by the signatures in this module.
FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any]  # Is a nested dict.
25
+
26
+
27
@dataclasses.dataclass(frozen=True)
class Protein:
  """Protein structure representation.

  Instances are immutable (frozen dataclass); all fields are numpy arrays
  whose leading axis indexes residues.
  """

  # Cartesian coordinates of atoms in angstroms. The atom-type axis follows
  # the ordering of residue_constants.atom_types.
  atom_positions: np.ndarray  # [num_res, num_atom_type, 3]

  # Amino-acid type for each residue represented as an integer between 0 and
  # 20, where 20 is 'X'.
  aatype: np.ndarray  # [num_res]

  # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
  # is present and 0.0 if not. This should be used for loss masking.
  atom_mask: np.ndarray  # [num_res, num_atom_type]

  # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
  residue_index: np.ndarray  # [num_res]

  # B-factors, or temperature factors, of each residue (in sq. angstroms units),
  # representing the displacement of the residue from its ground truth mean
  # value.
  b_factors: np.ndarray  # [num_res, num_atom_type]
50
+
51
+
52
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
  """Takes a PDB string and constructs a Protein object.

  WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

  Args:
    pdb_str: The contents of the pdb file
    chain_id: If None, then the pdb file must contain a single chain (which
      will be parsed). If chain_id is specified (e.g. A), then only that chain
      is parsed.

  Returns:
    A new `Protein` parsed from the pdb contents.

  Raises:
    ValueError: If the PDB does not contain exactly one model, if chain_id is
      None and the model does not contain exactly one chain, or if any residue
      of the selected chain carries an insertion code.
  """
  pdb_fh = io.StringIO(pdb_str)
  parser = PDBParser(QUIET=True)
  structure = parser.get_structure('none', pdb_fh)
  models = list(structure.get_models())
  if len(models) != 1:
    raise ValueError(
        f'Only single model PDBs are supported. Found {len(models)} models.')
  model = models[0]

  if chain_id is not None:
    chain = model[chain_id]
  else:
    chains = list(model.get_chains())
    if len(chains) != 1:
      raise ValueError(
          'Only single chain PDBs are supported when chain_id not specified. '
          f'Found {len(chains)} chains.')
    chain = chains[0]

  atom_positions = []
  aatype = []
  atom_mask = []
  residue_index = []
  b_factors = []

  for res in chain:
    # res.id[2] is the insertion code; a blank means "none".
    if res.id[2] != ' ':
      raise ValueError(
          f'PDB contains an insertion code at chain {chain.id} and residue '
          f'index {res.id[1]}. These are not supported.')
    # Unknown three-letter codes fall back to 'X', which in turn maps to the
    # index residue_constants.restype_num.
    res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
    restype_idx = residue_constants.restype_order.get(
        res_shortname, residue_constants.restype_num)
    pos = np.zeros((residue_constants.atom_type_num, 3))
    mask = np.zeros((residue_constants.atom_type_num,))
    res_b_factors = np.zeros((residue_constants.atom_type_num,))
    for atom in res:
      if atom.name not in residue_constants.atom_types:
        continue
      pos[residue_constants.atom_order[atom.name]] = atom.coord
      mask[residue_constants.atom_order[atom.name]] = 1.
      res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
    if np.sum(mask) < 0.5:
      # If no known atom positions are reported for the residue then skip it.
      continue
    aatype.append(restype_idx)
    atom_positions.append(pos)
    atom_mask.append(mask)
    residue_index.append(res.id[1])
    b_factors.append(res_b_factors)

  return Protein(
      atom_positions=np.array(atom_positions),
      atom_mask=np.array(atom_mask),
      aatype=np.array(aatype),
      residue_index=np.array(residue_index),
      b_factors=np.array(b_factors))
125
+
126
+
127
def to_pdb(prot: Protein) -> str:
  """Converts a `Protein` instance to a PDB string.

  All residues are written as a single chain 'A' inside one MODEL record,
  followed by a TER record. Atoms whose mask is below 0.5 are omitted.

  Args:
    prot: The protein to convert to PDB. It must contain at least one residue,
      because the TER record is built from the last residue.

  Returns:
    PDB string.

  Raises:
    ValueError: If any entry of `prot.aatype` exceeds
      `residue_constants.restype_num`.
  """
  restypes = residue_constants.restypes + ['X']
  res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
  atom_types = residue_constants.atom_types

  pdb_lines = []

  atom_mask = prot.atom_mask
  aatype = prot.aatype
  atom_positions = prot.atom_positions
  residue_index = prot.residue_index.astype(np.int32)
  b_factors = prot.b_factors

  if np.any(aatype > residue_constants.restype_num):
    raise ValueError('Invalid aatypes.')

  pdb_lines.append('MODEL     1')
  atom_index = 1
  chain_id = 'A'
  # Add all atom sites.
  for i in range(aatype.shape[0]):
    res_name_3 = res_1to3(aatype[i])
    for atom_name, pos, mask, b_factor in zip(
        atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
      if mask < 0.5:
        continue

      record_type = 'ATOM'
      # Names shorter than four characters start in the second name column.
      name = atom_name if len(atom_name) == 4 else f' {atom_name}'
      alt_loc = ''
      insertion_code = ''
      occupancy = 1.00
      element = atom_name[0]  # Protein supports only C, N, O, S, this works.
      charge = ''
      # PDB is a columnar format, every space matters here!
      # Columns: 1-6 record, 7-11 serial, 13-16 name, 17 altLoc, 18-20 resName,
      # 22 chain, 23-26 resSeq, 27 iCode, 31-54 x/y/z, 55-60 occupancy,
      # 61-66 B-factor, 77-78 element, 79-80 charge.
      atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
                   f'{res_name_3:>3} {chain_id:>1}'
                   f'{residue_index[i]:>4}{insertion_code:>1}   '
                   f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
                   f'{occupancy:>6.2f}{b_factor:>6.2f}          '
                   f'{element:>2}{charge:>2}')
      pdb_lines.append(atom_line)
      atom_index += 1

  # Close the chain. TER columns: 1-6 record, 7-11 serial, 18-20 resName,
  # 22 chain, 23-26 resSeq.
  chain_end = 'TER'
  chain_termination_line = (
      f'{chain_end:<6}{atom_index:>5}      {res_1to3(aatype[-1]):>3} '
      f'{chain_id:>1}{residue_index[-1]:>4}')
  pdb_lines.append(chain_termination_line)
  pdb_lines.append('ENDMDL')

  pdb_lines.append('END')
  # Trailing empty entry so the joined string ends with a newline.
  pdb_lines.append('')
  return '\n'.join(pdb_lines)
190
+
191
+
192
def ideal_atom_mask(prot: Protein) -> np.ndarray:
  """Computes an ideal atom mask.

  `Protein.atom_mask` typically is defined according to the atoms that are
  reported in the PDB. This function computes a mask according to heavy atoms
  that should be present in the given sequence of amino acids.

  Args:
    prot: `Protein` whose fields are `numpy.ndarray` objects.

  Returns:
    An ideal atom mask, obtained by indexing
    `residue_constants.STANDARD_ATOM_MASK` with `prot.aatype`.
  """
  return residue_constants.STANDARD_ATOM_MASK[prot.aatype]
206
+
207
+
208
def from_prediction(features: FeatureDict, result: ModelOutput,
                    b_factors: Optional[np.ndarray] = None) -> Protein:
  """Assembles a protein from a prediction.

  Args:
    features: Dictionary holding model inputs. The 'aatype' and
      'residue_index' entries are read, taking element 0 along their first
      axis.
    result: Dictionary holding model outputs. The 'final_atom_positions' and
      'final_atom_mask' entries of result['structure_module'] are read.
    b_factors: (Optional) B-factors to use for the protein. Defaults to zeros
      with the shape of the final atom mask.

  Returns:
    A protein instance.
  """
  fold_output = result['structure_module']
  if b_factors is None:
    b_factors = np.zeros_like(fold_output['final_atom_mask'])

  return Protein(
      aatype=features['aatype'][0],
      atom_positions=fold_output['final_atom_positions'],
      atom_mask=fold_output['final_atom_mask'],
      # Shift the feature residue index by one for the Protein representation.
      residue_index=features['residue_index'][0] + 1,
      b_factors=b_factors)
alphafold/alphafold/common/protein_test.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for protein."""
16
+
17
+ import os
18
+
19
+ from absl.testing import absltest
20
+ from absl.testing import parameterized
21
+ from alphafold.common import protein
22
+ from alphafold.common import residue_constants
23
+ import numpy as np
24
+ # Internal import (7716).
25
+
26
+ TEST_DATA_DIR = 'alphafold/common/testdata/'
27
+
28
+
29
class ProteinTest(parameterized.TestCase):
  """Tests PDB parsing, PDB writing and the ideal atom mask of `protein`."""

  @staticmethod
  def _read_testdata(filename):
    """Returns the text content of `filename` from the test data directory."""
    path = os.path.join(
        absltest.get_default_test_srcdir(), TEST_DATA_DIR, filename)
    with open(path) as f:
      return f.read()

  def _check_shapes(self, prot, num_res):
    """Check that every array field of `prot` has the expected shape."""
    per_residue = (num_res,)
    per_atom = (num_res, residue_constants.atom_type_num)
    self.assertEqual(per_atom + (3,), prot.atom_positions.shape)
    self.assertEqual(per_residue, prot.aatype.shape)
    self.assertEqual(per_atom, prot.atom_mask.shape)
    self.assertEqual(per_residue, prot.residue_index.shape)
    self.assertEqual(per_atom, prot.b_factors.shape)

  @parameterized.parameters(('2rbg.pdb', 'A', 282),
                            ('2rbg.pdb', 'B', 282))
  def test_from_pdb_str(self, pdb_file, chain_id, num_res):
    """Parsing one chain yields correctly shaped arrays and valid aatypes."""
    prot = protein.from_pdb_string(self._read_testdata(pdb_file), chain_id)
    self._check_shapes(prot, num_res)
    self.assertGreaterEqual(prot.aatype.min(), 0)
    # Upper bound is inclusive: unknown restypes are encoded as restype_num.
    self.assertLessEqual(prot.aatype.max(), residue_constants.restype_num)

  def test_to_pdb(self):
    """Writing a protein to PDB text and re-parsing it preserves all fields."""
    prot = protein.from_pdb_string(
        self._read_testdata('2rbg.pdb'), chain_id='A')
    prot_reconstr = protein.from_pdb_string(protein.to_pdb(prot))

    np.testing.assert_array_equal(prot_reconstr.aatype, prot.aatype)
    np.testing.assert_array_almost_equal(
        prot_reconstr.atom_positions, prot.atom_positions)
    np.testing.assert_array_almost_equal(
        prot_reconstr.atom_mask, prot.atom_mask)
    np.testing.assert_array_equal(
        prot_reconstr.residue_index, prot.residue_index)
    np.testing.assert_array_almost_equal(
        prot_reconstr.b_factors, prot.b_factors)

  def test_ideal_atom_mask(self):
    """The observed mask equals the ideal mask except for listed residues."""
    prot = protein.from_pdb_string(
        self._read_testdata('2rbg.pdb'), chain_id='A')
    ideal_mask = protein.ideal_atom_mask(prot)
    # Residue numbers whose observed atoms are expected to differ from the
    # ideal mask in this test file.
    non_ideal_residues = {102, *range(127, 285)}
    for res, observed, ideal in zip(
        prot.residue_index, prot.atom_mask, ideal_mask):
      matches_ideal = np.all(observed == ideal)
      if res in non_ideal_residues:
        self.assertFalse(matches_ideal, msg=f'{res}')
      else:
        self.assertTrue(matches_ideal, msg=f'{res}')
86
+
87
+
88
+ if __name__ == '__main__':
89
+ absltest.main()
alphafold/alphafold/common/residue_constants.py ADDED
@@ -0,0 +1,895 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Constants used in AlphaFold."""
16
+
17
+ import collections
18
+ import functools
19
+ from typing import List, Mapping, Tuple
20
+
21
+ import numpy as np
22
+ import tree
23
+
24
+ # Internal import (35fd).
25
+
26
+
27
+ # Distance from one CA to next CA [trans configuration: omega = 180].
28
+ ca_ca = 3.80209737096
29
+
30
+ # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
31
+ # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
32
+ # chi angles so their chi angle lists are empty.
33
+ chi_angles_atoms = {
34
+ 'ALA': [],
35
+ # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
36
+ 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
37
+ ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],
38
+ 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
39
+ 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
40
+ 'CYS': [['N', 'CA', 'CB', 'SG']],
41
+ 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
42
+ ['CB', 'CG', 'CD', 'OE1']],
43
+ 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
44
+ ['CB', 'CG', 'CD', 'OE1']],
45
+ 'GLY': [],
46
+ 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],
47
+ 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],
48
+ 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
49
+ 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
50
+ ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],
51
+ 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],
52
+ ['CB', 'CG', 'SD', 'CE']],
53
+ 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
54
+ 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],
55
+ 'SER': [['N', 'CA', 'CB', 'OG']],
56
+ 'THR': [['N', 'CA', 'CB', 'OG1']],
57
+ 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
58
+ 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
59
+ 'VAL': [['N', 'CA', 'CB', 'CG1']],
60
+ }
61
+
62
+ # If chi angles given in fixed-length array, this matrix determines how to mask
63
+ # them for each AA type. The order is as per restype_order (see below).
64
+ chi_angles_mask = [
65
+ [0.0, 0.0, 0.0, 0.0], # ALA
66
+ [1.0, 1.0, 1.0, 1.0], # ARG
67
+ [1.0, 1.0, 0.0, 0.0], # ASN
68
+ [1.0, 1.0, 0.0, 0.0], # ASP
69
+ [1.0, 0.0, 0.0, 0.0], # CYS
70
+ [1.0, 1.0, 1.0, 0.0], # GLN
71
+ [1.0, 1.0, 1.0, 0.0], # GLU
72
+ [0.0, 0.0, 0.0, 0.0], # GLY
73
+ [1.0, 1.0, 0.0, 0.0], # HIS
74
+ [1.0, 1.0, 0.0, 0.0], # ILE
75
+ [1.0, 1.0, 0.0, 0.0], # LEU
76
+ [1.0, 1.0, 1.0, 1.0], # LYS
77
+ [1.0, 1.0, 1.0, 0.0], # MET
78
+ [1.0, 1.0, 0.0, 0.0], # PHE
79
+ [1.0, 1.0, 0.0, 0.0], # PRO
80
+ [1.0, 0.0, 0.0, 0.0], # SER
81
+ [1.0, 0.0, 0.0, 0.0], # THR
82
+ [1.0, 1.0, 0.0, 0.0], # TRP
83
+ [1.0, 1.0, 0.0, 0.0], # TYR
84
+ [1.0, 0.0, 0.0, 0.0], # VAL
85
+ ]
86
+
87
+ # The following chi angles are pi periodic: they can be rotated by a multiple
88
+ # of pi without affecting the structure.
89
+ chi_pi_periodic = [
90
+ [0.0, 0.0, 0.0, 0.0], # ALA
91
+ [0.0, 0.0, 0.0, 0.0], # ARG
92
+ [0.0, 0.0, 0.0, 0.0], # ASN
93
+ [0.0, 1.0, 0.0, 0.0], # ASP
94
+ [0.0, 0.0, 0.0, 0.0], # CYS
95
+ [0.0, 0.0, 0.0, 0.0], # GLN
96
+ [0.0, 0.0, 1.0, 0.0], # GLU
97
+ [0.0, 0.0, 0.0, 0.0], # GLY
98
+ [0.0, 0.0, 0.0, 0.0], # HIS
99
+ [0.0, 0.0, 0.0, 0.0], # ILE
100
+ [0.0, 0.0, 0.0, 0.0], # LEU
101
+ [0.0, 0.0, 0.0, 0.0], # LYS
102
+ [0.0, 0.0, 0.0, 0.0], # MET
103
+ [0.0, 1.0, 0.0, 0.0], # PHE
104
+ [0.0, 0.0, 0.0, 0.0], # PRO
105
+ [0.0, 0.0, 0.0, 0.0], # SER
106
+ [0.0, 0.0, 0.0, 0.0], # THR
107
+ [0.0, 0.0, 0.0, 0.0], # TRP
108
+ [0.0, 1.0, 0.0, 0.0], # TYR
109
+ [0.0, 0.0, 0.0, 0.0], # VAL
110
+ [0.0, 0.0, 0.0, 0.0], # UNK
111
+ ]
112
+
113
+ # Atom positions relative to the 8 rigid groups, defined by the pre-omega, phi,
114
+ # psi and chi angles:
115
+ # 0: 'backbone group',
116
+ # 1: 'pre-omega-group', (empty)
117
+ # 2: 'phi-group', (currently empty, because it defines only hydrogens)
118
+ # 3: 'psi-group',
119
+ # 4,5,6,7: 'chi1,2,3,4-group'
120
+ # The atom positions are relative to the axis-end-atom of the corresponding
121
+ # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
122
+ # is defined such that the dihedral-angle-defining atom (the last entry in
123
+ # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
124
+ # format: [atomname, group_idx, rel_position]
125
+ rigid_group_atom_positions = {
126
+ 'ALA': [
127
+ ['N', 0, (-0.525, 1.363, 0.000)],
128
+ ['CA', 0, (0.000, 0.000, 0.000)],
129
+ ['C', 0, (1.526, -0.000, -0.000)],
130
+ ['CB', 0, (-0.529, -0.774, -1.205)],
131
+ ['O', 3, (0.627, 1.062, 0.000)],
132
+ ],
133
+ 'ARG': [
134
+ ['N', 0, (-0.524, 1.362, -0.000)],
135
+ ['CA', 0, (0.000, 0.000, 0.000)],
136
+ ['C', 0, (1.525, -0.000, -0.000)],
137
+ ['CB', 0, (-0.524, -0.778, -1.209)],
138
+ ['O', 3, (0.626, 1.062, 0.000)],
139
+ ['CG', 4, (0.616, 1.390, -0.000)],
140
+ ['CD', 5, (0.564, 1.414, 0.000)],
141
+ ['NE', 6, (0.539, 1.357, -0.000)],
142
+ ['NH1', 7, (0.206, 2.301, 0.000)],
143
+ ['NH2', 7, (2.078, 0.978, -0.000)],
144
+ ['CZ', 7, (0.758, 1.093, -0.000)],
145
+ ],
146
+ 'ASN': [
147
+ ['N', 0, (-0.536, 1.357, 0.000)],
148
+ ['CA', 0, (0.000, 0.000, 0.000)],
149
+ ['C', 0, (1.526, -0.000, -0.000)],
150
+ ['CB', 0, (-0.531, -0.787, -1.200)],
151
+ ['O', 3, (0.625, 1.062, 0.000)],
152
+ ['CG', 4, (0.584, 1.399, 0.000)],
153
+ ['ND2', 5, (0.593, -1.188, 0.001)],
154
+ ['OD1', 5, (0.633, 1.059, 0.000)],
155
+ ],
156
+ 'ASP': [
157
+ ['N', 0, (-0.525, 1.362, -0.000)],
158
+ ['CA', 0, (0.000, 0.000, 0.000)],
159
+ ['C', 0, (1.527, 0.000, -0.000)],
160
+ ['CB', 0, (-0.526, -0.778, -1.208)],
161
+ ['O', 3, (0.626, 1.062, -0.000)],
162
+ ['CG', 4, (0.593, 1.398, -0.000)],
163
+ ['OD1', 5, (0.610, 1.091, 0.000)],
164
+ ['OD2', 5, (0.592, -1.101, -0.003)],
165
+ ],
166
+ 'CYS': [
167
+ ['N', 0, (-0.522, 1.362, -0.000)],
168
+ ['CA', 0, (0.000, 0.000, 0.000)],
169
+ ['C', 0, (1.524, 0.000, 0.000)],
170
+ ['CB', 0, (-0.519, -0.773, -1.212)],
171
+ ['O', 3, (0.625, 1.062, -0.000)],
172
+ ['SG', 4, (0.728, 1.653, 0.000)],
173
+ ],
174
+ 'GLN': [
175
+ ['N', 0, (-0.526, 1.361, -0.000)],
176
+ ['CA', 0, (0.000, 0.000, 0.000)],
177
+ ['C', 0, (1.526, 0.000, 0.000)],
178
+ ['CB', 0, (-0.525, -0.779, -1.207)],
179
+ ['O', 3, (0.626, 1.062, -0.000)],
180
+ ['CG', 4, (0.615, 1.393, 0.000)],
181
+ ['CD', 5, (0.587, 1.399, -0.000)],
182
+ ['NE2', 6, (0.593, -1.189, -0.001)],
183
+ ['OE1', 6, (0.634, 1.060, 0.000)],
184
+ ],
185
+ 'GLU': [
186
+ ['N', 0, (-0.528, 1.361, 0.000)],
187
+ ['CA', 0, (0.000, 0.000, 0.000)],
188
+ ['C', 0, (1.526, -0.000, -0.000)],
189
+ ['CB', 0, (-0.526, -0.781, -1.207)],
190
+ ['O', 3, (0.626, 1.062, 0.000)],
191
+ ['CG', 4, (0.615, 1.392, 0.000)],
192
+ ['CD', 5, (0.600, 1.397, 0.000)],
193
+ ['OE1', 6, (0.607, 1.095, -0.000)],
194
+ ['OE2', 6, (0.589, -1.104, -0.001)],
195
+ ],
196
+ 'GLY': [
197
+ ['N', 0, (-0.572, 1.337, 0.000)],
198
+ ['CA', 0, (0.000, 0.000, 0.000)],
199
+ ['C', 0, (1.517, -0.000, -0.000)],
200
+ ['O', 3, (0.626, 1.062, -0.000)],
201
+ ],
202
+ 'HIS': [
203
+ ['N', 0, (-0.527, 1.360, 0.000)],
204
+ ['CA', 0, (0.000, 0.000, 0.000)],
205
+ ['C', 0, (1.525, 0.000, 0.000)],
206
+ ['CB', 0, (-0.525, -0.778, -1.208)],
207
+ ['O', 3, (0.625, 1.063, 0.000)],
208
+ ['CG', 4, (0.600, 1.370, -0.000)],
209
+ ['CD2', 5, (0.889, -1.021, 0.003)],
210
+ ['ND1', 5, (0.744, 1.160, -0.000)],
211
+ ['CE1', 5, (2.030, 0.851, 0.002)],
212
+ ['NE2', 5, (2.145, -0.466, 0.004)],
213
+ ],
214
+ 'ILE': [
215
+ ['N', 0, (-0.493, 1.373, -0.000)],
216
+ ['CA', 0, (0.000, 0.000, 0.000)],
217
+ ['C', 0, (1.527, -0.000, -0.000)],
218
+ ['CB', 0, (-0.536, -0.793, -1.213)],
219
+ ['O', 3, (0.627, 1.062, -0.000)],
220
+ ['CG1', 4, (0.534, 1.437, -0.000)],
221
+ ['CG2', 4, (0.540, -0.785, -1.199)],
222
+ ['CD1', 5, (0.619, 1.391, 0.000)],
223
+ ],
224
+ 'LEU': [
225
+ ['N', 0, (-0.520, 1.363, 0.000)],
226
+ ['CA', 0, (0.000, 0.000, 0.000)],
227
+ ['C', 0, (1.525, -0.000, -0.000)],
228
+ ['CB', 0, (-0.522, -0.773, -1.214)],
229
+ ['O', 3, (0.625, 1.063, -0.000)],
230
+ ['CG', 4, (0.678, 1.371, 0.000)],
231
+ ['CD1', 5, (0.530, 1.430, -0.000)],
232
+ ['CD2', 5, (0.535, -0.774, 1.200)],
233
+ ],
234
+ 'LYS': [
235
+ ['N', 0, (-0.526, 1.362, -0.000)],
236
+ ['CA', 0, (0.000, 0.000, 0.000)],
237
+ ['C', 0, (1.526, 0.000, 0.000)],
238
+ ['CB', 0, (-0.524, -0.778, -1.208)],
239
+ ['O', 3, (0.626, 1.062, -0.000)],
240
+ ['CG', 4, (0.619, 1.390, 0.000)],
241
+ ['CD', 5, (0.559, 1.417, 0.000)],
242
+ ['CE', 6, (0.560, 1.416, 0.000)],
243
+ ['NZ', 7, (0.554, 1.387, 0.000)],
244
+ ],
245
+ 'MET': [
246
+ ['N', 0, (-0.521, 1.364, -0.000)],
247
+ ['CA', 0, (0.000, 0.000, 0.000)],
248
+ ['C', 0, (1.525, 0.000, 0.000)],
249
+ ['CB', 0, (-0.523, -0.776, -1.210)],
250
+ ['O', 3, (0.625, 1.062, -0.000)],
251
+ ['CG', 4, (0.613, 1.391, -0.000)],
252
+ ['SD', 5, (0.703, 1.695, 0.000)],
253
+ ['CE', 6, (0.320, 1.786, -0.000)],
254
+ ],
255
+ 'PHE': [
256
+ ['N', 0, (-0.518, 1.363, 0.000)],
257
+ ['CA', 0, (0.000, 0.000, 0.000)],
258
+ ['C', 0, (1.524, 0.000, -0.000)],
259
+ ['CB', 0, (-0.525, -0.776, -1.212)],
260
+ ['O', 3, (0.626, 1.062, -0.000)],
261
+ ['CG', 4, (0.607, 1.377, 0.000)],
262
+ ['CD1', 5, (0.709, 1.195, -0.000)],
263
+ ['CD2', 5, (0.706, -1.196, 0.000)],
264
+ ['CE1', 5, (2.102, 1.198, -0.000)],
265
+ ['CE2', 5, (2.098, -1.201, -0.000)],
266
+ ['CZ', 5, (2.794, -0.003, -0.001)],
267
+ ],
268
+ 'PRO': [
269
+ ['N', 0, (-0.566, 1.351, -0.000)],
270
+ ['CA', 0, (0.000, 0.000, 0.000)],
271
+ ['C', 0, (1.527, -0.000, 0.000)],
272
+ ['CB', 0, (-0.546, -0.611, -1.293)],
273
+ ['O', 3, (0.621, 1.066, 0.000)],
274
+ ['CG', 4, (0.382, 1.445, 0.0)],
275
+ # ['CD', 5, (0.427, 1.440, 0.0)],
276
+ ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
277
+ ],
278
+ 'SER': [
279
+ ['N', 0, (-0.529, 1.360, -0.000)],
280
+ ['CA', 0, (0.000, 0.000, 0.000)],
281
+ ['C', 0, (1.525, -0.000, -0.000)],
282
+ ['CB', 0, (-0.518, -0.777, -1.211)],
283
+ ['O', 3, (0.626, 1.062, -0.000)],
284
+ ['OG', 4, (0.503, 1.325, 0.000)],
285
+ ],
286
+ 'THR': [
287
+ ['N', 0, (-0.517, 1.364, 0.000)],
288
+ ['CA', 0, (0.000, 0.000, 0.000)],
289
+ ['C', 0, (1.526, 0.000, -0.000)],
290
+ ['CB', 0, (-0.516, -0.793, -1.215)],
291
+ ['O', 3, (0.626, 1.062, 0.000)],
292
+ ['CG2', 4, (0.550, -0.718, -1.228)],
293
+ ['OG1', 4, (0.472, 1.353, 0.000)],
294
+ ],
295
+ 'TRP': [
296
+ ['N', 0, (-0.521, 1.363, 0.000)],
297
+ ['CA', 0, (0.000, 0.000, 0.000)],
298
+ ['C', 0, (1.525, -0.000, 0.000)],
299
+ ['CB', 0, (-0.523, -0.776, -1.212)],
300
+ ['O', 3, (0.627, 1.062, 0.000)],
301
+ ['CG', 4, (0.609, 1.370, -0.000)],
302
+ ['CD1', 5, (0.824, 1.091, 0.000)],
303
+ ['CD2', 5, (0.854, -1.148, -0.005)],
304
+ ['CE2', 5, (2.186, -0.678, -0.007)],
305
+ ['CE3', 5, (0.622, -2.530, -0.007)],
306
+ ['NE1', 5, (2.140, 0.690, -0.004)],
307
+ ['CH2', 5, (3.028, -2.890, -0.013)],
308
+ ['CZ2', 5, (3.283, -1.543, -0.011)],
309
+ ['CZ3', 5, (1.715, -3.389, -0.011)],
310
+ ],
311
+ 'TYR': [
312
+ ['N', 0, (-0.522, 1.362, 0.000)],
313
+ ['CA', 0, (0.000, 0.000, 0.000)],
314
+ ['C', 0, (1.524, -0.000, -0.000)],
315
+ ['CB', 0, (-0.522, -0.776, -1.213)],
316
+ ['O', 3, (0.627, 1.062, -0.000)],
317
+ ['CG', 4, (0.607, 1.382, -0.000)],
318
+ ['CD1', 5, (0.716, 1.195, -0.000)],
319
+ ['CD2', 5, (0.713, -1.194, -0.001)],
320
+ ['CE1', 5, (2.107, 1.200, -0.002)],
321
+ ['CE2', 5, (2.104, -1.201, -0.003)],
322
+ ['OH', 5, (4.168, -0.002, -0.005)],
323
+ ['CZ', 5, (2.791, -0.001, -0.003)],
324
+ ],
325
+ 'VAL': [
326
+ ['N', 0, (-0.494, 1.373, -0.000)],
327
+ ['CA', 0, (0.000, 0.000, 0.000)],
328
+ ['C', 0, (1.527, -0.000, -0.000)],
329
+ ['CB', 0, (-0.533, -0.795, -1.213)],
330
+ ['O', 3, (0.627, 1.062, -0.000)],
331
+ ['CG1', 4, (0.540, 1.429, -0.000)],
332
+ ['CG2', 4, (0.533, -0.776, 1.203)],
333
+ ],
334
+ }
335
+
336
+ # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
337
+ residue_atoms = {
338
+ 'ALA': ['C', 'CA', 'CB', 'N', 'O'],
339
+ 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],
340
+ 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],
341
+ 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],
342
+ 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],
343
+ 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],
344
+ 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],
345
+ 'GLY': ['C', 'CA', 'N', 'O'],
346
+ 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],
347
+ 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],
348
+ 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],
349
+ 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],
350
+ 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],
351
+ 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],
352
+ 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],
353
+ 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],
354
+ 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],
355
+ 'TRP': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3',
356
+ 'CH2', 'N', 'NE1', 'O'],
357
+ 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O',
358
+ 'OH'],
359
+ 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']
360
+ }
361
+
362
+ # Naming swaps for ambiguous atom names.
363
+ # Due to symmetries in the amino acids the naming of atoms is ambiguous in
364
+ # 4 of the 20 amino acids.
365
+ # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
366
+ # in LEU, VAL and ARG can be resolved by using the 3d constellations of
367
+ # the 'ambiguous' atoms and their neighbours)
368
+ residue_atom_renaming_swaps = {
369
+ 'ASP': {'OD1': 'OD2'},
370
+ 'GLU': {'OE1': 'OE2'},
371
+ 'PHE': {'CD1': 'CD2', 'CE1': 'CE2'},
372
+ 'TYR': {'CD1': 'CD2', 'CE1': 'CE2'},
373
+ }
374
+
375
+ # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
376
+ van_der_waals_radius = {
377
+ 'C': 1.7,
378
+ 'N': 1.55,
379
+ 'O': 1.52,
380
+ 'S': 1.8,
381
+ }
382
+
383
+ Bond = collections.namedtuple(
384
+ 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])
385
+ BondAngle = collections.namedtuple(
386
+ 'BondAngle',
387
+ ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])
388
+
389
+
390
@functools.lru_cache(maxsize=None)
def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]],
                                          Mapping[str, List[Bond]],
                                          Mapping[str, List[BondAngle]]]:
  """Load stereo_chemical_props.txt into a nice structure.

  Load literature values for bond lengths and bond angles and translate
  bond angles into the length of the opposite edge of the triangle
  ("residue_virtual_bonds").

  The data file is first looked up next to this module, so loading does not
  depend on the current working directory. If no file is found there, the
  legacy path 'alphafold/common/stereo_chemical_props.txt' (relative to the
  working directory) is used.

  The function is wrapped in `functools.lru_cache`, so the file is parsed at
  most once per process and repeated calls return the same objects.

  Returns:
    residue_bonds: dict that maps resname --> list of Bond tuples
    residue_virtual_bonds: dict that maps resname --> list of Bond tuples
    residue_bond_angles: dict that maps resname --> list of BondAngle tuples

  Raises:
    FileNotFoundError: If the data file exists at neither location.
  """
  import os  # Function-scope import; only this function needs it.

  legacy_path = 'alphafold/common/stereo_chemical_props.txt'
  candidate_paths = []
  module_file = globals().get('__file__')
  if module_file:
    candidate_paths.append(os.path.join(
        os.path.dirname(os.path.abspath(module_file)),
        'stereo_chemical_props.txt'))
  candidate_paths.append(legacy_path)
  # Fall back to the legacy path when nothing exists so that `open` raises
  # the same FileNotFoundError as before.
  stereo_chemical_props_path = next(
      (path for path in candidate_paths if os.path.isfile(path)), legacy_path)

  with open(stereo_chemical_props_path, 'rt') as f:
    stereo_chemical_props = f.read()
  lines_iter = iter(stereo_chemical_props.splitlines())

  # Load bond lengths. Each row is "<atom1>-<atom2> <resname> <length>
  # <stddev>"; the section ends at a line consisting of '-'.
  residue_bonds = {}
  next(lines_iter)  # Skip header line.
  for line in lines_iter:
    if line.strip() == '-':
      break
    bond, resname, length, stddev = line.split()
    atom1, atom2 = bond.split('-')
    residue_bonds.setdefault(resname, []).append(
        Bond(atom1, atom2, float(length), float(stddev)))
  residue_bonds['UNK'] = []

  # Load bond angles. Each row is "<atom1>-<atom2>-<atom3> <resname> <angle>
  # <stddev>" in degrees; values are converted to radians.
  residue_bond_angles = {}
  next(lines_iter)  # Skip empty line.
  next(lines_iter)  # Skip header line.
  for line in lines_iter:
    if line.strip() == '-':
      break
    bond, resname, angle_degree, stddev_degree = line.split()
    atom1, atom2, atom3 = bond.split('-')
    residue_bond_angles.setdefault(resname, []).append(
        BondAngle(atom1, atom2, atom3,
                  float(angle_degree) / 180. * np.pi,
                  float(stddev_degree) / 180. * np.pi))
  residue_bond_angles['UNK'] = []

  def make_bond_key(atom1_name, atom2_name):
    """Unique key to lookup bonds, independent of atom order."""
    return '-'.join(sorted([atom1_name, atom2_name]))

  # Translate bond angles into distances ("virtual bonds").
  residue_virtual_bonds = {}
  for resname, bond_angles in residue_bond_angles.items():
    # Create a fast lookup dict for bond lengths.
    bond_cache = {
        make_bond_key(b.atom1_name, b.atom2_name): b
        for b in residue_bonds[resname]
    }
    residue_virtual_bonds[resname] = []
    for ba in bond_angles:
      bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
      bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]

      # Compute distance between atom1 and atom3 using the law of cosines
      # c^2 = a^2 + b^2 - 2ab*cos(gamma).
      gamma = ba.angle_rad
      length = np.sqrt(bond1.length**2 + bond2.length**2
                       - 2 * bond1.length * bond2.length * np.cos(gamma))

      # Propagation of uncertainty assuming uncorrelated errors: partial
      # derivatives of `length` w.r.t. the angle and both bond lengths.
      dl_outer = 0.5 / length
      dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
      dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
      dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
      stddev = np.sqrt((dl_dgamma * ba.stddev)**2 +
                       (dl_db1 * bond1.stddev)**2 +
                       (dl_db2 * bond2.stddev)**2)
      residue_virtual_bonds[resname].append(
          Bond(ba.atom1_name, ba.atom3name, length, stddev))

  return (residue_bonds,
          residue_virtual_bonds,
          residue_bond_angles)
477
+
478
+
479
+ # Between-residue bond lengths for general bonds (first element) and for Proline
480
+ # (second element).
481
+ between_res_bond_length_c_n = [1.329, 1.341]
482
+ between_res_bond_length_stddev_c_n = [0.014, 0.016]
483
+
484
+ # Between-residue cos_angles.
485
+ between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
486
+ between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
487
+
488
+ # This mapping is used when we need to store atom data in a format that requires
489
+ # fixed atom data size for every residue (e.g. a numpy array).
490
+ atom_types = [
491
+ 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
492
+ 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
493
+ 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
494
+ 'CZ3', 'NZ', 'OXT'
495
+ ]
496
+ atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
497
+ atom_type_num = len(atom_types) # := 37.
498
+
499
+ # A compact atom encoding with 14 columns
500
+ # pylint: disable=line-too-long
501
+ # pylint: disable=bad-whitespace
502
+ restype_name_to_atom14_names = {
503
+ 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''],
504
+ 'ARG': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', ''],
505
+ 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''],
506
+ 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''],
507
+ 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''],
508
+ 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''],
509
+ 'GLU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''],
510
+ 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],
511
+ 'HIS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', ''],
512
+ 'ILE': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''],
513
+ 'LEU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''],
514
+ 'LYS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''],
515
+ 'MET': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''],
516
+ 'PHE': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', ''],
517
+ 'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', ''],
518
+ 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''],
519
+ 'THR': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''],
520
+ 'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
521
+ 'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', ''],
522
+ 'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''],
523
+ 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''],
524
+
525
+ }
526
+ # pylint: enable=line-too-long
527
+ # pylint: enable=bad-whitespace
528
+
529
+
530
+ # This is the standard residue order when coding AA type as a number.
531
+ # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
532
+ restypes = [
533
+ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',
534
+ 'S', 'T', 'W', 'Y', 'V'
535
+ ]
536
+ restype_order = {restype: i for i, restype in enumerate(restypes)}
537
+ restype_num = len(restypes) # := 20.
538
+ unk_restype_index = restype_num # Catch-all index for unknown restypes.
539
+
540
+ restypes_with_x = restypes + ['X']
541
+ restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
542
+
543
+
544
def sequence_to_onehot(
    sequence: str,
    mapping: Mapping[str, int],
    map_unknown_to_x: bool = False) -> np.ndarray:
  """One-hot encodes an amino acid sequence according to `mapping`.

  Args:
    sequence: An amino acid sequence.
    mapping: A dictionary mapping amino acids to integers. Its values must
      cover 0..max(values) without gaps.
    map_unknown_to_x: If True, every upper-case alphabetic character missing
      from the mapping is encoded as the unknown amino acid 'X'; the mapping
      must then contain 'X', otherwise an error is thrown. If False, any
      character that is not in the mapping throws an error.

  Returns:
    A numpy int32 array of shape (seq_len, num_unique_aas) with one-hot
    encoding of the sequence.

  Raises:
    ValueError: If the mapping doesn't contain values from 0 to
      num_unique_aas - 1 without any gaps, or if `map_unknown_to_x` is True and
      the sequence contains a character that is not an upper-case letter.
  """
  num_entries = max(mapping.values()) + 1
  distinct_ids = sorted(set(mapping.values()))

  if distinct_ids != list(range(num_entries)):
    raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
                     'without any gaps. Got: %s' % sorted(mapping.values()))

  encoded = np.zeros((len(sequence), num_entries), dtype=np.int32)

  for row, residue in enumerate(sequence):
    if not map_unknown_to_x:
      column = mapping[residue]
    elif residue.isalpha() and residue.isupper():
      # The 'X' default is looked up eagerly, so 'X' must be in the mapping.
      column = mapping.get(residue, mapping['X'])
    else:
      raise ValueError(f'Invalid character in the sequence: {residue}')
    encoded[row, column] = 1

  return encoded
585
+
586
+
587
+ restype_1to3 = {
588
+ 'A': 'ALA',
589
+ 'R': 'ARG',
590
+ 'N': 'ASN',
591
+ 'D': 'ASP',
592
+ 'C': 'CYS',
593
+ 'Q': 'GLN',
594
+ 'E': 'GLU',
595
+ 'G': 'GLY',
596
+ 'H': 'HIS',
597
+ 'I': 'ILE',
598
+ 'L': 'LEU',
599
+ 'K': 'LYS',
600
+ 'M': 'MET',
601
+ 'F': 'PHE',
602
+ 'P': 'PRO',
603
+ 'S': 'SER',
604
+ 'T': 'THR',
605
+ 'W': 'TRP',
606
+ 'Y': 'TYR',
607
+ 'V': 'VAL',
608
+ }
609
+
610
+
611
+ # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
612
+ # 1-to-1 mapping of 3 letter names to one letter names. The latter contains
613
+ # many more, and less common, three letter names as keys and maps many of these
614
+ # to the same one letter name (including 'X' and 'U' which we don't use here).
615
+ restype_3to1 = {v: k for k, v in restype_1to3.items()}
616
+
617
+ # Define a restype name for all unknown residues.
618
+ unk_restype = 'UNK'
619
+
620
+ resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
621
+ resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
622
+
623
+
624
+ # The mapping here uses hhblits convention, so that B is mapped to D, J and O
625
+ # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
626
+ # remaining 20 amino acids are kept in alphabetical order.
627
+ # There are 2 non-amino acid codes, X (representing any amino acid) and
628
+ # "-" representing a missing amino acid in an alignment. The id for these
629
+ # codes is put at the end (20 and 21) so that they can easily be ignored if
630
+ # desired.
631
+ HHBLITS_AA_TO_ID = {
632
+ 'A': 0,
633
+ 'B': 2,
634
+ 'C': 1,
635
+ 'D': 2,
636
+ 'E': 3,
637
+ 'F': 4,
638
+ 'G': 5,
639
+ 'H': 6,
640
+ 'I': 7,
641
+ 'J': 20,
642
+ 'K': 8,
643
+ 'L': 9,
644
+ 'M': 10,
645
+ 'N': 11,
646
+ 'O': 20,
647
+ 'P': 12,
648
+ 'Q': 13,
649
+ 'R': 14,
650
+ 'S': 15,
651
+ 'T': 16,
652
+ 'U': 1,
653
+ 'V': 17,
654
+ 'W': 18,
655
+ 'X': 20,
656
+ 'Y': 19,
657
+ 'Z': 3,
658
+ '-': 21,
659
+ }
660
+
661
+ # Partial inversion of HHBLITS_AA_TO_ID.
662
+ ID_TO_HHBLITS_AA = {
663
+ 0: 'A',
664
+ 1: 'C', # Also U.
665
+ 2: 'D', # Also B.
666
+ 3: 'E', # Also Z.
667
+ 4: 'F',
668
+ 5: 'G',
669
+ 6: 'H',
670
+ 7: 'I',
671
+ 8: 'K',
672
+ 9: 'L',
673
+ 10: 'M',
674
+ 11: 'N',
675
+ 12: 'P',
676
+ 13: 'Q',
677
+ 14: 'R',
678
+ 15: 'S',
679
+ 16: 'T',
680
+ 17: 'V',
681
+ 18: 'W',
682
+ 19: 'Y',
683
+ 20: 'X', # Includes J and O.
684
+ 21: '-',
685
+ }
686
+
687
restypes_with_x_and_gap = restypes + ['X', '-']
# Entry i is the position in restypes_with_x_and_gap of the letter that
# hhblits id i stands for, i.e. a lookup from hhblits ids to this module's ids.
MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(map(
    restypes_with_x_and_gap.index,
    map(ID_TO_HHBLITS_AA.__getitem__, range(len(restypes_with_x_and_gap)))))
691
+
692
+
693
def _make_standard_atom_mask() -> np.ndarray:
  """Builds the per-residue-type atom presence mask.

  Returns:
    int32 array of shape [restype_num + 1, atom_type_num]. Entry [r, a] is 1
    when atom type `a` is listed in `residue_atoms` for residue type `r`.
    The extra last row (unknown residue) stays all zeros.
  """
  presence = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
  for row, one_letter in enumerate(restypes):
    for name in residue_atoms[restype_1to3[one_letter]]:
      presence[row, atom_order[name]] = 1
  return presence


STANDARD_ATOM_MASK = _make_standard_atom_mask()
707
+
708
+
709
# One-hot encodings of the first and second atoms that define the rotation
# axis of every chi angle, for every residue type.
def chi_angle_atom(atom_index: int) -> np.ndarray:
  """Define chi-angle rigid groups via one-hot representations.

  Args:
    atom_index: Which atom of each chi-angle atom list in `chi_angles_atoms`
      to encode.

  Returns:
    Array of shape [len(restypes) + 1, atom_type_num, 4]: for each residue
    type and each of the 4 chi slots, a one-hot vector over atom types. The
    final residue entry (residue `X`) is all zeros.
  """
  identity = np.eye(atom_type_num)

  # Atom-type index of the selected atom for each chi angle, padded to 4 slots.
  # NOTE(review): the padding value is -1, and identity[-1] is the LAST row of
  # the identity matrix, so padded slots come out one-hot on the last atom
  # type rather than all-zero. Kept as is; confirm this is intended.
  padded_indices = {}
  for res_name, chi_defs in chi_angles_atoms.items():
    selected = [atom_types.index(chi_def[atom_index]) for chi_def in chi_defs]
    padded_indices[res_name] = selected + [-1] * (4 - len(selected))

  per_residue = [
      identity[padded_indices[restype_1to3[letter]]] for letter in restypes
  ]
  per_residue.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.

  # [residue, chi, atom] -> [residue, atom, chi].
  return np.transpose(np.stack(per_residue, axis=0), [0, 2, 1])


chi_atom_1_one_hot = chi_angle_atom(1)
chi_atom_2_one_hot = chi_angle_atom(2)
734
+
735
# Same content as chi_angles_atoms, but atoms are given by their atom_order
# index instead of their name, residues follow `restypes` order, and every
# residue is padded with [0, 0, 0, 0] rows up to 4 chi angles.
chi_angles_atom_indices = np.array([
    [[atom_order[atom_name] for atom_name in chi_atoms]
     for chi_atoms in chi_angles_atoms[restype_1to3[r]]]
    + [[0, 0, 0, 0]] * (4 - len(chi_angles_atoms[restype_1to3[r]]))
    for r in restypes])
742
+
743
# Maps each (res_name, atom_name) pair to the list of
# (chi group index, atom index within that group) positions where the atom
# appears in chi_angles_atoms.
chi_groups_for_atom = {}
for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
  for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
    for atom_i, atom in enumerate(chi_group):
      chi_groups_for_atom.setdefault((res_name, atom), []).append(
          (chi_group_i, atom_i))
751
+
752
+
753
+ def _make_rigid_transformation_4x4(ex, ey, translation):
754
+ """Create a rigid 4x4 transformation matrix from two axes and transl."""
755
+ # Normalize ex.
756
+ ex_normalized = ex / np.linalg.norm(ex)
757
+
758
+ # make ey perpendicular to ex
759
+ ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
760
+ ey_normalized /= np.linalg.norm(ey_normalized)
761
+
762
+ # compute ez as cross product
763
+ eznorm = np.cross(ex_normalized, ey_normalized)
764
+ m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
765
+ m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0)
766
+ return m
767
+
768
+
769
# Lookup tables filled in by _make_rigid_group_constants():
#   * (restype, atomtype) --> rigid_group_idx, in atom37 and atom14 layout;
#   * (restype, atomtype) --> 0/1 mask of atoms that were filled in;
#   * (restype, atomtype, coord) --> atom position;
#   * (restype, group) --> 4x4 affine transform from one rigid group to the
#     previous group. The 8 groups are indexed as in
#     _make_rigid_group_constants(): 0 backbone, 1 pre-omega, 2 phi, 3 psi,
#     4-7 chi1-chi4.
# The leading dimension is 21 -- presumably the 20 standard residue types plus
# one slot for unknown residues (TODO confirm against restype_num).
#
# The integer tables use the builtin `int` as dtype. The former `np.int` was
# only an alias for it, was deprecated in NumPy 1.20 and removed in 1.24, so
# `dtype=np.int` raises AttributeError on current NumPy.
restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
780
+
781
+
782
def _make_rigid_group_constants():
  """Fills the module-level rigid-group lookup arrays in place.

  Writes to restype_atom37_to_rigid_group, restype_atom37_mask,
  restype_atom37_rigid_group_positions, their atom14 counterparts and
  restype_rigid_group_default_frame, using rigid_group_atom_positions,
  chi_angles_atoms and chi_angles_mask as inputs. Returns nothing.
  """
  # Pass 1: per-atom tables (group index, mask, position) in both the atom37
  # layout (indexed via atom_order) and the atom14 layout (indexed via the
  # atom's position in restype_name_to_atom14_names).
  for restype, restype_letter in enumerate(restypes):
    resname = restype_1to3[restype_letter]
    for atomname, group_idx, atom_position in rigid_group_atom_positions[
        resname]:
      atomtype = atom_order[atomname]
      restype_atom37_to_rigid_group[restype, atomtype] = group_idx
      restype_atom37_mask[restype, atomtype] = 1
      restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position

      atom14idx = restype_name_to_atom14_names[resname].index(atomname)
      restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
      restype_atom14_mask[restype, atom14idx] = 1
      restype_atom14_rigid_group_positions[restype,
                                           atom14idx, :] = atom_position

  # Pass 2: default 4x4 frame of each of the 8 rigid groups, per residue type.
  for restype, restype_letter in enumerate(restypes):
    resname = restype_1to3[restype_letter]
    atom_positions = {name: np.array(pos) for name, _, pos
                      in rigid_group_atom_positions[resname]}

    # backbone to backbone is the identity transform
    restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)

    # pre-omega-frame to backbone (currently dummy identity matrix)
    restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)

    # phi-frame to backbone
    mat = _make_rigid_transformation_4x4(
        ex=atom_positions['N'] - atom_positions['CA'],
        ey=np.array([1., 0., 0.]),
        translation=atom_positions['N'])
    restype_rigid_group_default_frame[restype, 2, :, :] = mat

    # psi-frame to backbone
    mat = _make_rigid_transformation_4x4(
        ex=atom_positions['C'] - atom_positions['CA'],
        ey=atom_positions['CA'] - atom_positions['N'],
        translation=atom_positions['C'])
    restype_rigid_group_default_frame[restype, 3, :, :] = mat

    # chi1-frame to backbone
    # Only residues whose chi_angles_mask marks chi1 as present get a frame;
    # for the others slot 4 keeps its initial zeros.
    if chi_angles_mask[restype][0]:
      base_atom_names = chi_angles_atoms[resname][0]
      base_atom_positions = [atom_positions[name] for name in base_atom_names]
      mat = _make_rigid_transformation_4x4(
          ex=base_atom_positions[2] - base_atom_positions[1],
          ey=base_atom_positions[0] - base_atom_positions[1],
          translation=base_atom_positions[2])
      restype_rigid_group_default_frame[restype, 4, :, :] = mat

    # chi2-frame to chi1-frame
    # chi3-frame to chi2-frame
    # chi4-frame to chi3-frame
    # luckily all rotation axes for the next frame start at (0,0,0) of the
    # previous frame
    # (so the position of the axis end atom serves both as the x direction and
    # as the translation; this presumes rigid_group_atom_positions stores each
    # atom in the local coordinates of its own group -- TODO confirm).
    for chi_idx in range(1, 4):
      if chi_angles_mask[restype][chi_idx]:
        axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
        axis_end_atom_position = atom_positions[axis_end_atom_name]
        mat = _make_rigid_transformation_4x4(
            ex=axis_end_atom_position,
            ey=np.array([-1., 0., 0.]),
            translation=axis_end_atom_position)
        restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat


# Populate the tables once, at import time.
_make_rigid_group_constants()
851
+
852
+
853
def make_atom14_dists_bounds(overlap_tolerance=1.5,
                             bond_length_tolerance_factor=15):
  """Computes lower/upper distance bounds between atom14 atoms of a residue.

  Non-bonded atom pairs get a clash bound: the sum of the two van der Waals
  radii minus `overlap_tolerance` as lower bound and 1e10 as upper bound.
  Pairs listed as bonds or virtual bonds by load_stereo_chemical_props() are
  then overwritten with length -/+ `bond_length_tolerance_factor` * stddev.

  Args:
    overlap_tolerance: Amount subtracted from the sum of van der Waals radii.
    bond_length_tolerance_factor: Number of standard deviations a bond length
      may deviate from its reference value.

  Returns:
    Dict with float32 arrays 'lower_bound', 'upper_bound' and 'stddev', each
    of shape (21, 14, 14) and symmetric in the two atom axes.
  """
  lower_bounds = np.zeros([21, 14, 14], np.float32)
  upper_bounds = np.zeros([21, 14, 14], np.float32)
  stddevs = np.zeros([21, 14, 14], np.float32)
  residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()

  for res_idx, one_letter in enumerate(restypes):
    resname = restype_1to3[one_letter]
    atom_list = restype_name_to_atom14_names[resname]

    # Clash bounds for every ordered pair of distinct, non-empty atom slots.
    for i, name_i in enumerate(atom_list):
      if not name_i:
        continue
      # The first character of the atom name is the element symbol key.
      radius_i = van_der_waals_radius[name_i[0]]
      for j, name_j in enumerate(atom_list):
        if i == j or not name_j:
          continue
        radius_j = van_der_waals_radius[name_j[0]]
        clash_lower = radius_i + radius_j - overlap_tolerance
        for a, b in ((i, j), (j, i)):
          lower_bounds[res_idx, a, b] = clash_lower
          upper_bounds[res_idx, a, b] = 1e10

    # Bonds and virtual bonds (angles) replace the clash bounds.
    for bond in residue_bonds[resname] + residue_virtual_bonds[resname]:
      i = atom_list.index(bond.atom1_name)
      j = atom_list.index(bond.atom2_name)
      bond_lower = bond.length - bond_length_tolerance_factor * bond.stddev
      bond_upper = bond.length + bond_length_tolerance_factor * bond.stddev
      for a, b in ((i, j), (j, i)):
        lower_bounds[res_idx, a, b] = bond_lower
        upper_bounds[res_idx, a, b] = bond_upper
        stddevs[res_idx, a, b] = bond.stddev

  return {'lower_bound': lower_bounds,  # shape (21,14,14)
          'upper_bound': upper_bounds,  # shape (21,14,14)
          'stddev': stddevs,  # shape (21,14,14)
         }
alphafold/alphafold/common/residue_constants_test.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Test that residue_constants generates correct values."""
16
+
17
+ from absl.testing import absltest
18
+ from absl.testing import parameterized
19
+ from alphafold.common import residue_constants
20
+ import numpy as np
21
+
22
+
23
+ class ResidueConstantsTest(parameterized.TestCase):
24
+
25
@parameterized.parameters(
    ('ALA', 0), ('CYS', 1), ('HIS', 2), ('MET', 3), ('LYS', 4), ('ARG', 4),
)
def testChiAnglesAtoms(self, residue_name, chi_num):
  """Each residue has `chi_num` chi angles, each defined by 4 atoms."""
  chi_definitions = residue_constants.chi_angles_atoms[residue_name]
  self.assertLen(chi_definitions, chi_num)
  for chi_definition in chi_definitions:
    self.assertLen(chi_definition, 4)
38
+
39
def testChiGroupsForAtom(self):
  """chi_groups_for_atom entries point back at the atom they are keyed by."""
  chi_angles_atoms = residue_constants.chi_angles_atoms
  for key, positions in residue_constants.chi_groups_for_atom.items():
    res_name, atom_name = key
    for group_index, index_in_group in positions:
      self.assertEqual(
          atom_name, chi_angles_atoms[res_name][group_index][index_in_group])
46
+
47
@parameterized.parameters(
    ('ALA', 5), ('ARG', 11), ('ASN', 8), ('ASP', 8), ('CYS', 6),
    ('GLN', 9), ('GLU', 9), ('GLY', 4), ('HIS', 10), ('ILE', 8),
    ('LEU', 8), ('LYS', 9), ('MET', 8), ('PHE', 11), ('PRO', 7),
    ('SER', 6), ('THR', 7), ('TRP', 14), ('TYR', 12), ('VAL', 7),
)
def testResidueAtoms(self, atom_name, num_residue_atoms):
  """residue_atoms lists the expected number of atoms per residue."""
  # Despite its name, `atom_name` holds the three letter residue name.
  self.assertLen(residue_constants.residue_atoms[atom_name], num_residue_atoms)
56
+
57
+ def testStandardAtomMask(self):
58
+ with self.subTest('Check shape'):
59
+ self.assertEqual(residue_constants.STANDARD_ATOM_MASK.shape, (21, 37,))
60
+
61
+ with self.subTest('Check values'):
62
+ str_to_row = lambda s: [c == '1' for c in s] # More clear/concise.
63
+ np.testing.assert_array_equal(
64
+ residue_constants.STANDARD_ATOM_MASK,
65
+ np.array([
66
+ # NB This was defined by c+p but looks sane.
67
+ str_to_row('11111 '), # ALA
68
+ str_to_row('111111 1 1 11 1 '), # ARG
69
+ str_to_row('111111 11 '), # ASP
70
+ str_to_row('111111 11 '), # ASN
71
+ str_to_row('11111 1 '), # CYS
72
+ str_to_row('111111 1 11 '), # GLU
73
+ str_to_row('111111 1 11 '), # GLN
74
+ str_to_row('111 1 '), # GLY
75
+ str_to_row('111111 11 1 1 '), # HIS
76
+ str_to_row('11111 11 1 '), # ILE
77
+ str_to_row('111111 11 '), # LEU
78
+ str_to_row('111111 1 1 1 '), # LYS
79
+ str_to_row('111111 11 '), # MET
80
+ str_to_row('111111 11 11 1 '), # PHE
81
+ str_to_row('111111 1 '), # PRO
82
+ str_to_row('11111 1 '), # SER
83
+ str_to_row('11111 1 1 '), # THR
84
+ str_to_row('111111 11 11 1 1 11 '), # TRP
85
+ str_to_row('111111 11 11 11 '), # TYR
86
+ str_to_row('11111 11 '), # VAL
87
+ str_to_row(' '), # UNK
88
+ ]))
89
+
90
+ with self.subTest('Check row totals'):
91
+ # Check each row has the right number of atoms.
92
+ for row, restype in enumerate(residue_constants.restypes): # A, R, ...
93
+ long_restype = residue_constants.restype_1to3[restype] # ALA, ARG, ...
94
+ atoms_names = residue_constants.residue_atoms[
95
+ long_restype] # ['C', 'CA', 'CB', 'N', 'O'], ...
96
+ self.assertLen(atoms_names,
97
+ residue_constants.STANDARD_ATOM_MASK[row, :].sum(),
98
+ long_restype)
99
+
100
def testAtomTypes(self):
  """The first five atom types are N, CA, C, CB, O and there are 37 in all."""
  leading_atoms = ('N', 'CA', 'C', 'CB', 'O')

  self.assertEqual(residue_constants.atom_type_num, 37)

  for index, name in enumerate(leading_atoms):
    self.assertEqual(residue_constants.atom_types[index], name)

  for index, name in enumerate(leading_atoms):
    self.assertEqual(residue_constants.atom_order[name], index)
  self.assertEqual(residue_constants.atom_type_num, 37)
115
+
116
def testRestypes(self):
  """`restypes` is ordered alphabetically by three letter residue name."""
  to_three = residue_constants.restype_1to3
  actual_names = [to_three[letter] for letter in residue_constants.restypes]
  expected_names = sorted(to_three.values())
  for actual, expected in zip(actual_names, expected_names):
    self.assertEqual(actual, expected)
  self.assertEqual(residue_constants.restype_num, 20)
123
+
124
def testSequenceToOneHotHHBlits(self):
  """Every hhblits letter is encoded in the column given by its hhblits id."""
  one_hot = residue_constants.sequence_to_onehot(
      'ABCDEFGHIJKLMNOPQRSTUVWXYZ-', residue_constants.HHBLITS_AA_TO_ID)
  # Expected hot column for each letter of the sequence above, in order
  # (A..Z, then '-'); there are 22 classes in total.
  hot_columns = [
      0, 2, 1, 2, 3, 4, 5, 6, 7,  # A-I
      20, 8, 9, 10, 11,  # J-N
      20, 12, 13, 14, 15, 16,  # O-T
      1, 17, 18, 20, 19, 3,  # U-Z
      21,  # '-'
  ]
  exp_one_hot = np.eye(22, dtype=int)[hot_columns]
  np.testing.assert_array_equal(one_hot, exp_one_hot)
156
+
157
def testSequenceToOneHotStandard(self):
  """The 20 standard residues in restype order encode to the identity."""
  standard_sequence = 'ARNDCQEGHILKMFPSTWYV'
  encoded = residue_constants.sequence_to_onehot(
      standard_sequence, residue_constants.restype_order)
  np.testing.assert_array_equal(encoded, np.eye(20))
161
+
162
def testSequenceToOneHotUnknownMapping(self):
  """Letters missing from the mapping are encoded as X (column 20)."""
  seq = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  # Expected hot column for each letter A..Z.
  hot_columns = [0, 20, 4, 3, 6, 13, 7, 8, 9, 20, 11, 10, 12, 2, 20, 14, 5, 1,
                 15, 16, 20, 19, 17, 20, 18, 20]
  expected_out = np.eye(21)[hot_columns]
  aa_types = residue_constants.sequence_to_onehot(
      sequence=seq,
      mapping=residue_constants.restype_order_with_x,
      map_unknown_to_x=True)
  self.assertTrue((aa_types == expected_out).all())
174
+
175
@parameterized.named_parameters(
    ('lowercase', 'aaa'),  # Insertions in A3M.
    ('gaps', '---'),  # Gaps in A3M.
    ('dots', '...'),  # Gaps in A3M.
    ('metadata', '>TEST'),  # FASTA metadata line.
)
def testSequenceToOneHotUnknownMappingError(self, seq):
  """These inputs raise ValueError even with map_unknown_to_x=True."""
  mapping = residue_constants.restype_order_with_x
  with self.assertRaises(ValueError):
    residue_constants.sequence_to_onehot(
        sequence=seq, mapping=mapping, map_unknown_to_x=True)
187
+
188
+
189
+ if __name__ == '__main__':
190
+ absltest.main()
alphafold/alphafold/common/testdata/2rbg.pdb ADDED
The diff for this file is too large to render. See raw diff
 
alphafold/alphafold/data/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Data pipeline for model features."""
alphafold/alphafold/data/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (191 Bytes). View file
 
alphafold/alphafold/data/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (208 Bytes). View file
 
alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-36.pyc ADDED
Binary file (11.4 kB). View file
 
alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-38.pyc ADDED
Binary file (11.5 kB). View file
 
alphafold/alphafold/data/__pycache__/parsers.cpython-36.pyc ADDED
Binary file (9.79 kB). View file
 
alphafold/alphafold/data/__pycache__/parsers.cpython-38.pyc ADDED
Binary file (9.9 kB). View file
 
alphafold/alphafold/data/__pycache__/pipeline.cpython-36.pyc ADDED
Binary file (5.88 kB). View file
 
alphafold/alphafold/data/__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (6.02 kB). View file
 
alphafold/alphafold/data/__pycache__/templates.cpython-36.pyc ADDED
Binary file (27.7 kB). View file
 
alphafold/alphafold/data/__pycache__/templates.cpython-38.pyc ADDED
Binary file (27.9 kB). View file
 
alphafold/alphafold/data/mmcif_parsing.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Parses the mmCIF file format."""
16
+ import collections
17
+ import dataclasses
18
+ import io
19
+ from typing import Any, Mapping, Optional, Sequence, Tuple
20
+
21
+ from absl import logging
22
+ from Bio import PDB
23
+ from Bio.Data import SCOPData
24
+
25
+ # Type aliases:
26
+ ChainId = str
27
+ PdbHeader = Mapping[str, Any]
28
+ PdbStructure = PDB.Structure.Structure
29
+ SeqRes = str
30
+ MmCIFDict = Mapping[str, Sequence[str]]
31
+
32
+
33
@dataclasses.dataclass(frozen=True)
class Monomer:
  """One entry of a polymer sequence as listed in `_entity_poly_seq`.

  `_get_protein_chains` builds these from the `_entity_poly_seq.mon_id`
  (stored as `id`) and `_entity_poly_seq.num` (stored as `num`) columns.
  """
  id: str
  num: int
37
+
38
+
39
# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
  """One row of the `_atom_site` loop, with the raw mmCIF string values.

  Instances are built by `_get_atom_site_list` straight from the parsed mmCIF
  dict, so every field holds the unconverted string. `parse` relies on this:
  it compares `model_num` against the string '1' and calls `int()` on
  `mmcif_seq_num`. Both fields are therefore annotated `str`, not `int`.
  """
  residue_name: str     # _atom_site.label_comp_id
  author_chain_id: str  # _atom_site.auth_asym_id
  mmcif_chain_id: str   # _atom_site.label_asym_id
  author_seq_num: str   # _atom_site.auth_seq_id
  mmcif_seq_num: str    # _atom_site.label_seq_id
  insertion_code: str   # _atom_site.pdbx_PDB_ins_code
  hetatm_atom: str      # _atom_site.group_PDB
  model_num: str        # _atom_site.pdbx_PDB_model_num
51
+
52
+
53
# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
  """Identifies a residue by chain id, residue number and insertion code.

  `parse` fills this from the author-assigned chain id and sequence number
  of an atom site, with ' ' as insertion code when the mmCIF value is unset.
  """
  chain_id: str
  residue_number: int
  insertion_code: str
59
+
60
+
61
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
  """What is found at one SEQRES index of a chain.

  `parse` creates entries with a `position` and `is_missing=False` for
  residues that have atom sites, and entries with `position=None` and
  `is_missing=True` for sequence positions without any.
  """
  position: Optional[ResiduePosition]
  name: str  # Residue name (atom-site residue name or monomer id).
  is_missing: bool
  hetflag: str  # ' ', 'W' (water) or 'H_' + residue name, as set by `parse`.
67
+
68
+
69
@dataclasses.dataclass(frozen=True)
class MmcifObject:
  """Representation of a parsed mmCIF file.

  Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
      files being processed.
    header: Biopython header.
    structure: Biopython structure.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
      {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
      SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                        1: ResidueAtPosition,
                                                        ...}}
    raw_string: The raw data used to construct the MmcifObject. NOTE: despite
      the name, `parse` in this module stores the parsed mmCIF dict here, not
      the original file contents.
  """
  file_id: str
  header: PdbHeader
  structure: PdbStructure
  chain_to_seqres: Mapping[ChainId, SeqRes]
  seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
  raw_string: Any
92
+
93
+
94
@dataclasses.dataclass(frozen=True)
class ParsingResult:
  """Returned by the parse function.

  Contains:
    mmcif_object: A MmcifObject, may be None if no chain could be successfully
      parsed.
    errors: A dict mapping (file_id, chain_id) to any exception generated.
      `parse` always uses '' as chain_id, and stores a plain message string
      instead of an exception when the file has no protein chains.
  """
  mmcif_object: Optional[MmcifObject]
  errors: Mapping[Tuple[str, str], Any]
105
+
106
+
107
+ class ParseError(Exception):
108
+ """An error indicating that an mmCIF file could not be parsed."""
109
+
110
+
111
def mmcif_loop_to_list(prefix: str,
                       parsed_info: MmCIFDict) -> Sequence[Mapping[str, str]]:
  """Collects the columns of one mmCIF loop into a list of row dicts.

  Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

  Args:
    prefix: Prefix shared by each of the data items in the loop, including the
      trailing period, e.g. '_entity_poly_seq.' for the items
      _entity_poly_seq.num and _entity_poly_seq.mon_id.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
      parser.

  Returns:
    A list with one dict per loop row, keyed by the full item name. Empty when
    no key starts with `prefix`.
  """
  matching = [(key, value) for key, value in parsed_info.items()
              if key.startswith(prefix)]
  cols = [key for key, _ in matching]
  data = [value for _, value in matching]

  # Every column of a loop must have the same number of rows.
  assert all([len(xs) == len(data[0]) for xs in data]), (
      'mmCIF error: Not all loops are the same length: %s' % cols)

  rows = []
  for row_values in zip(*data):
    rows.append(dict(zip(cols, row_values)))
  return rows
139
+
140
+
141
def mmcif_loop_to_dict(prefix: str,
                       index: str,
                       parsed_info: MmCIFDict,
                       ) -> Mapping[str, Mapping[str, str]]:
  """Collects one mmCIF loop into a dict of row dicts keyed by one column.

  Args:
    prefix: Prefix shared by each of the data items in the loop, including the
      trailing period, e.g. '_entity_poly_seq.'.
    index: Full name of the data item whose value becomes the key of each row.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
      parser.

  Returns:
    A dict mapping each row's `index` value to the row dict. If several rows
    share a value, the last one wins.
  """
  by_index = {}
  for row in mmcif_loop_to_list(prefix, parsed_info):
    by_index[row[index]] = row
  return by_index
161
+
162
+
163
def parse(*,
          file_id: str,
          mmcif_string: str,
          catch_all_errors: bool = True) -> ParsingResult:
  """Entry point, parses an mmcif_string.

  Args:
    file_id: A string identifier for this file. Should be unique within the
      collection of files being processed.
    mmcif_string: Contents of an mmCIF file.
    catch_all_errors: If True, all exceptions are caught and error messages are
      returned as part of the ParsingResult. If False exceptions will be allowed
      to propagate.

  Returns:
    A ParsingResult. Its mmcif_object is None when the file has no protein
    chains or when an exception was caught; the reason is stored in `errors`
    under the key (file_id, '').
  """
  errors = {}
  try:
    parser = PDB.MMCIFParser(QUIET=True)
    handle = io.StringIO(mmcif_string)
    full_structure = parser.get_structure('', handle)
    first_model_structure = _get_first_model(full_structure)
    # Extract the _mmcif_dict from the parser, which contains useful fields not
    # reflected in the Biopython structure.
    parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

    # Ensure all values are lists, even if singletons.
    # (Only existing keys are reassigned, so the dict keeps its size while it
    # is being iterated.)
    for key, value in parsed_info.items():
      if not isinstance(value, list):
        parsed_info[key] = [value]

    header = _get_header(parsed_info)

    # Determine the protein chains, and their start numbers according to the
    # internal mmCIF numbering scheme (likely but not guaranteed to be 1).
    valid_chains = _get_protein_chains(parsed_info=parsed_info)
    if not valid_chains:
      return ParsingResult(
          None, {(file_id, ''): 'No protein chains found in this file.'})
    seq_start_num = {chain_id: min([monomer.num for monomer in seq])
                     for chain_id, seq in valid_chains.items()}

    # Loop over the atoms for which we have coordinates. Populate two mappings:
    # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
    # by the authors / Biopython).
    # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
    mmcif_to_author_chain_id = {}
    seq_to_structure_mappings = {}
    for atom in _get_atom_site_list(parsed_info):
      # model_num is the raw mmCIF string, hence the comparison with '1'.
      if atom.model_num != '1':
        # We only process the first model at the moment.
        continue

      mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id

      if atom.mmcif_chain_id in valid_chains:
        hetflag = ' '
        if atom.hetatm_atom == 'HETATM':
          # Water atoms are assigned a special hetflag of W in Biopython. We
          # need to do the same, so that this hetflag can be used to fetch
          # a residue from the Biopython structure by id.
          if atom.residue_name in ('HOH', 'WAT'):
            hetflag = 'W'
          else:
            hetflag = 'H_' + atom.residue_name
        insertion_code = atom.insertion_code
        if not _is_set(atom.insertion_code):
          insertion_code = ' '
        # NOTE(review): int() raises ValueError if the author sequence number
        # or label_seq_id is not an integer string; that lands in the broad
        # except below and fails the whole file.
        position = ResiduePosition(chain_id=atom.author_chain_id,
                                   residue_number=int(atom.author_seq_num),
                                   insertion_code=insertion_code)
        # Index into the chain's sequence, relative to its first monomer.
        seq_idx = int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
        current = seq_to_structure_mappings.get(atom.author_chain_id, {})
        # Every atom of a residue rewrites the same entry; the last one wins.
        current[seq_idx] = ResidueAtPosition(position=position,
                                             name=atom.residue_name,
                                             is_missing=False,
                                             hetflag=hetflag)
        seq_to_structure_mappings[atom.author_chain_id] = current

    # Add missing residue information to seq_to_structure_mappings.
    for chain_id, seq_info in valid_chains.items():
      # NOTE(review): both lookups raise KeyError for a protein chain that has
      # no atom sites in model 1, since the mappings are only filled from
      # those atoms.
      author_chain = mmcif_to_author_chain_id[chain_id]
      current_mapping = seq_to_structure_mappings[author_chain]
      for idx, monomer in enumerate(seq_info):
        if idx not in current_mapping:
          current_mapping[idx] = ResidueAtPosition(position=None,
                                                   name=monomer.id,
                                                   is_missing=True,
                                                   hetflag=' ')

    # One letter sequence per author chain; monomers without a single-letter
    # code become 'X'.
    author_chain_to_sequence = {}
    for chain_id, seq_info in valid_chains.items():
      author_chain = mmcif_to_author_chain_id[chain_id]
      seq = []
      for monomer in seq_info:
        code = SCOPData.protein_letters_3to1.get(monomer.id, 'X')
        seq.append(code if len(code) == 1 else 'X')
      seq = ''.join(seq)
      author_chain_to_sequence[author_chain] = seq

    mmcif_object = MmcifObject(
        file_id=file_id,
        header=header,
        structure=first_model_structure,
        chain_to_seqres=author_chain_to_sequence,
        seqres_to_structure=seq_to_structure_mappings,
        raw_string=parsed_info)

    return ParsingResult(mmcif_object=mmcif_object, errors=errors)
  except Exception as e:  # pylint:disable=broad-except
    # The error is recorded first, then either re-raised or returned.
    errors[(file_id, '')] = e
    if not catch_all_errors:
      raise
    return ParsingResult(mmcif_object=None, errors=errors)
278
+
279
+
280
def _get_first_model(structure: PdbStructure) -> PdbStructure:
  """Returns the first model yielded by the structure's `get_models()`."""
  models = structure.get_models()
  return next(models)
283
+
284
+ _MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
285
+
286
+
287
def get_release_date(parsed_info: MmCIFDict) -> str:
  """Returns the oldest revision date.

  This is the minimum of the `_pdbx_audit_revision_history.revision_date`
  strings; it is the oldest date as long as they sort chronologically
  (presumably ISO `YYYY-MM-DD` -- verify against the data).
  """
  return min(parsed_info['_pdbx_audit_revision_history.revision_date'])
291
+
292
+
293
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
  """Returns a basic header containing method, release date and resolution.

  Keys of the result:
    structure_method: Comma-joined, lower-cased `_exptl.method` values.
    release_date: Oldest revision date; absent (with a warning logged) when
      the file has no revision history.
    resolution: 0.00 unless one of the known resolution items parses as a
      float. The items are tried in order and each success overwrites the
      previous one, so the last parsable item wins.
  """
  methods = [experiment['_exptl.method'].lower()
             for experiment in mmcif_loop_to_list('_exptl.', parsed_info)]
  header = {'structure_method': ','.join(methods)}

  # Note: The release_date here corresponds to the oldest revision. We prefer to
  # use this for dataset filtering over the deposition_date.
  if '_pdbx_audit_revision_history.revision_date' not in parsed_info:
    logging.warning('Could not determine release_date: %s',
                    parsed_info['_entry.id'])
  else:
    header['release_date'] = get_release_date(parsed_info)

  header['resolution'] = 0.00
  resolution_keys = ('_refine.ls_d_res_high',
                     '_em_3d_reconstruction.resolution',
                     '_reflns.d_resolution_high')
  for res_key in resolution_keys:
    if res_key not in parsed_info:
      continue
    try:
      header['resolution'] = float(parsed_info[res_key][0])
    except ValueError:
      logging.warning('Invalid resolution format: %s', parsed_info[res_key])

  return header
320
+
321
+
322
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
  """Returns list of atom sites; contains data not present in the structure.

  One AtomSite is built per row of the `_atom_site` loop. The columns below
  are listed in the order of the AtomSite fields.
  """
  columns = (
      '_atom_site.label_comp_id',        # residue_name
      '_atom_site.auth_asym_id',         # author_chain_id
      '_atom_site.label_asym_id',        # mmcif_chain_id
      '_atom_site.auth_seq_id',          # author_seq_num
      '_atom_site.label_seq_id',         # mmcif_seq_num
      '_atom_site.pdbx_PDB_ins_code',    # insertion_code
      '_atom_site.group_PDB',            # hetatm_atom
      '_atom_site.pdbx_PDB_model_num',   # model_num
  )
  rows = zip(*[parsed_info[column] for column in columns])
  return [AtomSite(*row) for row in rows]
334
+
335
+
336
def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]) -> Mapping[ChainId, Sequence[Monomer]]:
  """Extracts polymer information for protein chains only.

  Args:
    parsed_info: _mmcif_dict produced by the Biopython parser.

  Returns:
    A dict mapping mmcif chain id to a list of Monomers. Chains that belong to
    the same entity share the same list object.
  """
  # Monomer sequence of every polymer entity.
  polymers = collections.defaultdict(list)
  for row in mmcif_loop_to_list('_entity_poly_seq.', parsed_info):
    entity_monomers = polymers[row['_entity_poly_seq.entity_id']]
    entity_monomers.append(
        Monomer(id=row['_entity_poly_seq.mon_id'],
                num=int(row['_entity_poly_seq.num'])))

  # Chemical components by id; their type tells whether a monomer is
  # peptide-like.
  chem_comps = mmcif_loop_to_dict('_chem_comp.', '_chem_comp.id', parsed_info)

  # Chains of every entity, so the result can be keyed by chain id rather than
  # by entity id.
  entity_to_mmcif_chains = collections.defaultdict(list)
  for row in mmcif_loop_to_list('_struct_asym.', parsed_info):
    chain_id = row['_struct_asym.id']
    entity_id = row['_struct_asym.entity_id']
    entity_to_mmcif_chains[entity_id].append(chain_id)

  valid_chains = {}
  for entity_id, seq_info in polymers.items():
    chain_ids = entity_to_mmcif_chains[entity_id]

    # Reject polymers without any peptide-like components, such as DNA/RNA.
    # A list (not a generator) is used so that every monomer is looked up.
    peptide_flags = ['peptide' in chem_comps[monomer.id]['_chem_comp.type']
                     for monomer in seq_info]
    if not any(peptide_flags):
      continue
    valid_chains.update(dict.fromkeys(chain_ids, seq_info))
  return valid_chains
380
+
381
+
382
+ def _is_set(data: str) -> bool:
383
+ """Returns False if data is a special mmCIF character indicating 'unset'."""
384
+ return data not in ('.', '?')
alphafold/alphafold/data/parsers.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for parsing various file formats."""
16
+ import collections
17
+ import dataclasses
18
+ import re
19
+ import string
20
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
21
+
22
+ DeletionMatrix = Sequence[Sequence[int]]
23
+
24
+
25
@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Immutable record describing one template hit from an .hhr file."""
  # Hit number taken from the 'No <n>' line that opens the hit paragraph.
  index: int
  # Text of the hit's name line with its first character removed.
  name: str
  # Value of the 'Aligned_cols' field of the hit summary line.
  aligned_cols: int
  # Value of the 'Sum_probs' field of the hit summary line.
  sum_probs: float
  # Concatenated query rows of the alignment blocks ('-' marks a gap).
  query: str
  # Concatenated hit rows of the alignment blocks ('-' marks a gap).
  hit_sequence: str
  # Per alignment column: zero-based query residue index, or -1 for a gap.
  indices_query: List[int]
  # Per alignment column: zero-based hit residue index, or -1 for a gap.
  indices_hit: List[int]
36
+
37
+
38
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
  """Parses FASTA string and returns list of strings with amino-acid sequences.

  Arguments:
    fasta_string: The string contents of a FASTA file.

  Returns:
    A tuple of two lists:
    * A list of sequences.
    * A list of sequence descriptions taken from the comment lines. In the
      same order as the sequences.

  Raises:
    ValueError: If sequence data appears before the first '>' description
      line, since it cannot be attributed to any record.
  """
  descriptions = []
  # One list of line fragments per record; joined once at the end instead of
  # growing each sequence string with repeated concatenation.
  sequence_parts = []
  for raw_line in fasta_string.splitlines():
    line = raw_line.strip()
    if not line:
      continue  # Skip blank lines.
    if line.startswith('>'):
      descriptions.append(line[1:])  # Remove the '>' at the beginning.
      sequence_parts.append([])
      continue
    if not sequence_parts:
      raise ValueError(
          'FASTA sequence data found before the first description line: '
          f'{line!r}')
    sequence_parts[-1].append(line)

  sequences = [''.join(parts) for parts in sequence_parts]
  return sequences, descriptions
65
+
66
+
67
def parse_stockholm(
    stockholm_string: str
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
  """Extracts the MSA, deletion matrix and names from a Stockholm alignment.

  Args:
    stockholm_string: The string contents of a stockholm file. The first
      sequence in the file should be the query sequence.

  Returns:
    A tuple of:
      * A list of sequences that have been aligned to the query, restricted
        to the columns where the query has no gap. These might contain
        duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * The names of the targets matched, in order of first appearance.
  """
  # Rows of one sequence may be split over several lines; stitch them
  # together while remembering the order in which names first appear.
  rows_by_name = collections.OrderedDict()
  for raw_line in stockholm_string.splitlines():
    stripped = raw_line.strip()
    if not stripped or stripped.startswith(('#', '//')):
      continue
    name, chunk = stripped.split()
    rows_by_name[name] = rows_by_name.get(name, '') + chunk

  msa = []
  deletion_matrix = []
  query = ''
  keep_columns = []
  for row_number, row in enumerate(rows_by_name.values()):
    if row_number == 0:
      # The first row is the query; only its non-gap columns are kept.
      query = row
      keep_columns = [col for col, res in enumerate(query) if res != '-']

    msa.append(''.join(row[col] for col in keep_columns))

    # Residues of this row that sit in query-gap columns are counted and
    # charged to the next column where the query has a residue.
    deletions = []
    pending = 0
    for row_res, query_res in zip(row, query):
      if query_res != '-':
        deletions.append(pending)
        pending = 0
      elif row_res != '-':
        pending += 1
    deletion_matrix.append(deletions)

  return msa, deletion_matrix, list(rows_by_name)
125
+
126
+
127
def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
  """Extracts the aligned sequences and deletion matrix from an a3m file.

  Args:
    a3m_string: The string contents of a a3m file. The first sequence in the
      file should be the query sequence.

  Returns:
    A tuple of:
      * A list of sequences that have been aligned to the query, with the
        lowercase ASCII (insertion) characters removed. These might contain
        duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
  """
  sequences, _ = parse_fasta(a3m_string)

  # Make the MSA matrix out of aligned (deletion-free) sequences.
  drop_lowercase = str.maketrans('', '', string.ascii_lowercase)
  aligned_sequences = [row.translate(drop_lowercase) for row in sequences]

  deletion_matrix = []
  for row in sequences:
    counts = []
    run_length = 0
    for char in row:
      if not char.islower():
        # Aligned column: record how many lowercase characters preceded it.
        counts.append(run_length)
        run_length = 0
      else:
        run_length += 1
    deletion_matrix.append(counts)

  return aligned_sequences, deletion_matrix
159
+
160
+
161
def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
  """Yields the a3m characters for one Stockholm-aligned sequence.

  Args:
    query_non_gaps: One flag per alignment column, True where the query has
      a residue.
    sto_seq: The aligned sequence to convert.

  Yields:
    The character unchanged for columns flagged True; the lowercased
    character for other columns, except gaps ('-'), which are dropped.
  """
  for keep_column, residue in zip(query_non_gaps, sto_seq):
    if not keep_column:
      if residue == '-':
        continue
      residue = residue.lower()
    yield residue
168
+
169
+
170
def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None) -> str:
  """Converts MSA in Stockholm format to the A3M format.

  Args:
    stockholm_format: The string contents of a Stockholm file. The first
      sequence is treated as the query.
    max_sequences: If truthy, no new sequence names are accepted once this
      many sequences have been collected.

  Returns:
    The alignment as an a3m string, one '>name description' header plus one
    sequence line per kept sequence, ending with a newline.
  """
  lines = stockholm_format.splitlines()
  descriptions = {}
  sequences = {}
  reached_max_sequences = False

  # First pass: collect the (possibly multi-line) aligned sequences.
  for line in lines:
    reached_max_sequences = max_sequences and len(sequences) >= max_sequences
    # Ignore blank lines, markup and end symbols - remainder are alignment
    # sequence parts.
    if not line.strip() or line.startswith(('#', '//')):
      continue
    seqname, aligned_seq = line.split(maxsplit=1)
    if seqname in sequences:
      sequences[seqname] += aligned_seq
    elif not reached_max_sequences:
      sequences[seqname] = aligned_seq

  # Second pass: collect the 'DE' description rows, whose format is e.g.
  # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
  for line in lines:
    if not line.startswith('#=GS'):
      continue
    columns = line.split(maxsplit=3)
    seqname, feature = columns[1:3]
    if feature != 'DE':
      continue
    if reached_max_sequences and seqname not in sequences:
      continue
    descriptions[seqname] = columns[3] if len(columns) == 4 else ''
    if len(descriptions) == len(sequences):
      break

  # Convert sto format to a3m sequence by sequence; query_sequence is
  # assumed to be the first sequence.
  query_sequence = next(iter(sequences.values()))
  query_non_gaps = [res != '-' for res in query_sequence]
  a3m_sequences = {
      seqname: ''.join(_convert_sto_seq_to_a3m(query_non_gaps, sto_sequence))
      for seqname, sto_sequence in sequences.items()
  }

  fasta_chunks = (f">{name} {descriptions.get(name, '')}\n{a3m_sequence}"
                  for name, a3m_sequence in a3m_sequences.items())
  return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline.
216
+
217
+
218
def _get_hhr_line_regex_groups(
    regex_pattern: str, line: str) -> Sequence[Optional[str]]:
  """Matches `line` against `regex_pattern` and returns the captured groups.

  Args:
    regex_pattern: Regular expression, matched at the start of `line`.
    line: The text to parse.

  Returns:
    The tuple of groups captured by the match.

  Raises:
    RuntimeError: If the pattern does not match the line.
  """
  match = re.match(regex_pattern, line)
  if match is not None:
    return match.groups()
  raise RuntimeError(f'Could not parse query line {line}')
224
+
225
+
226
def _update_hhr_residue_indices_list(
    sequence: str, start_index: int, indices_list: List[int]):
  """Appends one index per symbol of `sequence` to `indices_list`, in place.

  Args:
    sequence: An aligned sequence fragment in which '-' marks a gap.
    start_index: The index assigned to the first non-gap symbol.
    indices_list: The list to extend. Gaps contribute -1; every other symbol
      contributes the next consecutive index, counting from `start_index`.
  """
  next_index = start_index
  for symbol in sequence:
    is_gap = symbol == '-'
    indices_list.append(-1 if is_gap else next_index)
    if not is_gap:
      next_index += 1
236
+
237
+
238
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
  """Builds a TemplateHit from the detailed comparison section of one hit.

  This works on .hhr files generated from both HHBlits and HHSearch.

  Args:
    detailed_lines: A list of lines from a single comparison section between 2
      sequences (which each have their own HMM's): the hit number line, the
      hit name line, the summary line and then the alignment rows.

  Returns:
    A TemplateHit with the information from that detailed comparison section.

  Raises:
    RuntimeError: If the summary line or an alignment row cannot be parsed.
    AssertionError: If an alignment row's length disagrees with the length
      derived from the query row of the same block.
  """
  # The first two lines hold the hit number (last token of line 0) and the
  # hit name (line 1 without its first character).
  hit_number = int(detailed_lines[0].split()[-1])
  hit_name = detailed_lines[1][1:]

  # Parse the summary line.
  summary_pattern = (
      'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
      ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t '
      ']*Template_Neff=(.*)')
  summary = re.match(summary_pattern, detailed_lines[2])
  if summary is None:
    raise RuntimeError(
        'Could not parse section: %s. Expected this: \n%s to contain summary.' %
        (detailed_lines, detailed_lines[2]))
  # All eight captured fields are converted to float; only Aligned_cols and
  # Sum_probs are used afterwards.
  summary_values = [float(field) for field in summary.groups()]
  aligned_cols = summary_values[3]
  sum_probs = summary_values[6]

  # The next section reads the detailed comparisons. These are in a 'human
  # readable' format which has a fixed length. The strategy employed is to
  # assume that each block starts with the query sequence line, and to parse
  # that with a regexp in order to deduce the fixed length used for that block.
  query = ''
  hit_sequence = ''
  indices_query = []
  indices_hit = []
  length_block = None

  query_annotation_rows = ('Q ss_dssp', 'Q ss_pred', 'Q Consensus')
  hit_annotation_rows = ('T ss_dssp', 'T ss_pred', 'T Consensus')

  for line in detailed_lines[3:]:
    if line.startswith('Q ') and not line.startswith(query_annotation_rows):
      # Query sequence row. The first 17 characters must be
      # 'Q <query_name> ', and we can parse everything after that.
      #              start    sequence       end       total_sequence_length
      query_row = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(query_row, line[17:])

      # Get the length of the parsed block using the start and finish indices,
      # and ensure it is the same as the actual block length.
      start = int(groups[0]) - 1  # Make index zero based.
      delta_query = groups[1]
      end = int(groups[2])
      length_block = end - start + delta_query.count('-')
      assert length_block == len(delta_query)

      # Update the query sequence and indices list.
      query += delta_query
      _update_hhr_residue_indices_list(delta_query, start, indices_query)

    elif line.startswith('T ') and not line.startswith(hit_annotation_rows):
      # Hit sequence row. The first 17 characters must be 'T <hit_name> ',
      # and we can parse everything after that.
      #            start    sequence       end     total_sequence_length
      hit_row = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(hit_row, line[17:])
      start = int(groups[0]) - 1  # Make index zero based.
      delta_hit_sequence = groups[1]
      assert length_block == len(delta_hit_sequence)

      # Update the hit sequence and indices list.
      hit_sequence += delta_hit_sequence
      _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)

  return TemplateHit(
      index=hit_number,
      name=hit_name,
      aligned_cols=int(aligned_cols),
      sum_probs=sum_probs,
      query=query,
      hit_sequence=hit_sequence,
      indices_query=indices_query,
      indices_hit=indices_hit,
  )
332
+
333
+
334
def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
  """Parses every hit paragraph of an HHR file.

  Args:
    hhr_string: The string contents of an .hhr file.

  Returns:
    One TemplateHit per paragraph, in file order; an empty list if the file
    has no line starting with 'No '.
  """
  lines = hhr_string.splitlines()

  # Each .hhr file starts with a results table, then has a sequence of hit
  # "paragraphs", each paragraph starting with a line 'No <hit number>'.
  starts = [i for i, line in enumerate(lines) if line.startswith('No ')]
  if not starts:
    return []

  # A paragraph runs up to the next paragraph start, or to the end of file.
  ends = starts[1:] + [len(lines)]
  return [_parse_hhr_hit(lines[begin:end])
          for begin, end in zip(starts, ends)]
350
+
351
+
352
def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
  """Parse target to e-value mapping parsed from Jackhmmer tblout string.

  Args:
    tblout: The string contents of a Jackhmmer tblout file.

  Returns:
    A dict mapping each target name to its full-sequence E-value. It always
    contains the entry 'query' -> 0.
  """
  e_values = {'query': 0}
  # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
  # space-delimited. Relevant fields are (1) target name:  and
  # (5) E-value (full sequence) (numbering from 1).
  for line in tblout.splitlines():
    # Blank or whitespace-only lines carry no fields and lines starting with
    # '#' are comments; indexing into either would raise IndexError.
    if not line.strip() or line.startswith('#'):
      continue
    fields = line.split()
    e_values[fields[0]] = float(fields[4])
  return e_values
alphafold/alphafold/data/pipeline.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for building the input features for the AlphaFold model."""
16
+
17
+ import os
18
+ from typing import Mapping, Optional, Sequence
19
+ from absl import logging
20
+ from alphafold.common import residue_constants
21
+ from alphafold.data import parsers
22
+ from alphafold.data import templates
23
+ from alphafold.data.tools import hhblits
24
+ from alphafold.data.tools import hhsearch
25
+ from alphafold.data.tools import jackhmmer
26
+ import numpy as np
27
+
28
+ # Internal import (7716).
29
+
30
+ FeatureDict = Mapping[str, np.ndarray]
31
+
32
+
33
def make_sequence_features(
    sequence: str, description: str, num_res: int) -> FeatureDict:
  """Builds the feature dict that depends only on the input sequence.

  Args:
    sequence: The input sequence.
    description: The sequence description; stored UTF-8 encoded.
    num_res: The number of residues, used as the length of the per-residue
      features.

  Returns:
    A dict with the keys 'aatype', 'between_segment_residues', 'domain_name',
    'residue_index', 'seq_length' and 'sequence'.
  """
  aatype = residue_constants.sequence_to_onehot(
      sequence=sequence,
      mapping=residue_constants.restype_order_with_x,
      map_unknown_to_x=True)
  encoded_description = description.encode('utf-8')
  encoded_sequence = sequence.encode('utf-8')
  return {
      'aatype': aatype,
      'between_segment_residues': np.zeros((num_res,), dtype=np.int32),
      'domain_name': np.array([encoded_description], dtype=np.object_),
      'residue_index': np.array(range(num_res), dtype=np.int32),
      # The sequence length is repeated once per residue.
      'seq_length': np.array([num_res] * num_res, dtype=np.int32),
      'sequence': np.array([encoded_sequence], dtype=np.object_),
  }
48
+
49
+
50
def make_msa_features(
    msas: Sequence[Sequence[str]],
    deletion_matrices: Sequence[parsers.DeletionMatrix]) -> FeatureDict:
  """Merges several MSAs into one deduplicated set of MSA features.

  Args:
    msas: The MSAs to merge, each a sequence of aligned sequences.
    deletion_matrices: One deletion matrix per MSA, indexed in parallel with
      `msas`.

  Returns:
    A dict with the keys 'deletion_matrix_int', 'msa' and 'num_alignments'.
    A sequence that occurs more than once (within or across MSAs) is kept
    only at its first occurrence.

  Raises:
    ValueError: If `msas` is empty or if any MSA contains no sequences.
  """
  if not msas:
    raise ValueError('At least one MSA must be provided.')

  aa_to_id = residue_constants.HHBLITS_AA_TO_ID
  unique_rows = []
  unique_deletions = []
  already_seen = set()
  for msa_index, msa in enumerate(msas):
    if not msa:
      raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
    for sequence_index, sequence in enumerate(msa):
      if sequence not in already_seen:
        already_seen.add(sequence)
        unique_rows.append([aa_to_id[res] for res in sequence])
        unique_deletions.append(deletion_matrices[msa_index][sequence_index])

  # The residue count is taken from the first sequence of the first MSA; the
  # number of alignments is repeated once per residue.
  num_res = len(msas[0][0])
  num_alignments = len(unique_rows)
  return {
      'deletion_matrix_int': np.array(unique_deletions, dtype=np.int32),
      'msa': np.array(unique_rows, dtype=np.int32),
      'num_alignments': np.array([num_alignments] * num_res, dtype=np.int32),
  }
79
+
80
+
81
class DataPipeline:
  """Runs the alignment tools and assembles the input features."""

  def __init__(self,
               jackhmmer_binary_path: str,
               hhblits_binary_path: str,
               hhsearch_binary_path: str,
               uniref90_database_path: str,
               mgnify_database_path: str,
               bfd_database_path: Optional[str],
               uniclust30_database_path: Optional[str],
               small_bfd_database_path: Optional[str],
               pdb70_database_path: str,
               template_featurizer: templates.TemplateHitFeaturizer,
               use_small_bfd: bool,
               mgnify_max_hits: int = 501,
               uniref_max_hits: int = 10000):
    """Creates the alignment tool runners used by `process`.

    Args:
      jackhmmer_binary_path: Path of the jackhmmer binary.
      hhblits_binary_path: Path of the hhblits binary.
      hhsearch_binary_path: Path of the hhsearch binary.
      uniref90_database_path: Database searched with jackhmmer.
      mgnify_database_path: Database searched with jackhmmer.
      bfd_database_path: Database searched with hhblits; only used when
        `use_small_bfd` is False.
      uniclust30_database_path: Database searched with hhblits; only used
        when `use_small_bfd` is False.
      small_bfd_database_path: Database searched with jackhmmer; only used
        when `use_small_bfd` is True.
      pdb70_database_path: Database searched with hhsearch.
      template_featurizer: Object whose `get_templates` turns the hhsearch
        hits into template features.
      use_small_bfd: Whether to search the small BFD with jackhmmer instead
        of BFD and Uniclust30 with hhblits.
      mgnify_max_hits: Maximum number of MGnify MSA rows that are kept.
      uniref_max_hits: Maximum number of UniRef90 sequences converted to a3m
        for the hhsearch query.
    """
    self._use_small_bfd = use_small_bfd
    self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
        binary_path=jackhmmer_binary_path,
        database_path=uniref90_database_path)
    if use_small_bfd:
      self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
          binary_path=jackhmmer_binary_path,
          database_path=small_bfd_database_path)
    else:
      self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
          binary_path=hhblits_binary_path,
          databases=[bfd_database_path, uniclust30_database_path])
    self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
        binary_path=jackhmmer_binary_path,
        database_path=mgnify_database_path)
    self.hhsearch_pdb70_runner = hhsearch.HHSearch(
        binary_path=hhsearch_binary_path,
        databases=[pdb70_database_path])
    self.template_featurizer = template_featurizer
    self.mgnify_max_hits = mgnify_max_hits
    self.uniref_max_hits = uniref_max_hits

  @staticmethod
  def _write_output(msa_output_dir: str, filename: str, contents: str) -> None:
    """Writes `contents` to `filename` inside `msa_output_dir`."""
    with open(os.path.join(msa_output_dir, filename), 'w') as f:
      f.write(contents)

  def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
    """Runs alignment tools on the input sequence and creates features.

    Args:
      input_fasta_path: Path of a FASTA file holding exactly one sequence.
      msa_output_dir: Existing directory that receives the raw tool outputs.

    Returns:
      The sequence, MSA and template features merged into a single dict.

    Raises:
      ValueError: If the FASTA file does not contain exactly one sequence.
    """
    with open(input_fasta_path) as f:
      input_fasta_str = f.read()
    input_seqs, input_descs = parsers.parse_fasta(input_fasta_str)
    if len(input_seqs) != 1:
      # Covers the empty file as well as the multi-sequence file.
      raise ValueError(
          f'Expected exactly one input sequence in {input_fasta_path}, '
          f'found {len(input_seqs)}.')
    input_sequence = input_seqs[0]
    input_description = input_descs[0]
    num_res = len(input_sequence)

    jackhmmer_uniref90_result = self.jackhmmer_uniref90_runner.query(
        input_fasta_path)[0]
    jackhmmer_mgnify_result = self.jackhmmer_mgnify_runner.query(
        input_fasta_path)[0]

    # The template search is seeded with the (size-capped) UniRef90 MSA.
    uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(
        jackhmmer_uniref90_result['sto'], max_sequences=self.uniref_max_hits)
    hhsearch_result = self.hhsearch_pdb70_runner.query(uniref90_msa_as_a3m)

    self._write_output(
        msa_output_dir, 'uniref90_hits.sto', jackhmmer_uniref90_result['sto'])
    self._write_output(
        msa_output_dir, 'mgnify_hits.sto', jackhmmer_mgnify_result['sto'])
    self._write_output(msa_output_dir, 'pdb70_hits.hhr', hhsearch_result)

    uniref90_msa, uniref90_deletion_matrix, _ = parsers.parse_stockholm(
        jackhmmer_uniref90_result['sto'])
    mgnify_msa, mgnify_deletion_matrix, _ = parsers.parse_stockholm(
        jackhmmer_mgnify_result['sto'])
    hhsearch_hits = parsers.parse_hhr(hhsearch_result)
    mgnify_msa = mgnify_msa[:self.mgnify_max_hits]
    mgnify_deletion_matrix = mgnify_deletion_matrix[:self.mgnify_max_hits]

    if self._use_small_bfd:
      jackhmmer_small_bfd_result = self.jackhmmer_small_bfd_runner.query(
          input_fasta_path)[0]

      # The jackhmmer result is in Stockholm format, so it is saved with a
      # matching '.sto' extension rather than '.a3m'.
      self._write_output(
          msa_output_dir, 'small_bfd_hits.sto',
          jackhmmer_small_bfd_result['sto'])

      bfd_msa, bfd_deletion_matrix, _ = parsers.parse_stockholm(
          jackhmmer_small_bfd_result['sto'])
    else:
      hhblits_bfd_uniclust_result = self.hhblits_bfd_uniclust_runner.query(
          input_fasta_path)

      self._write_output(
          msa_output_dir, 'bfd_uniclust_hits.a3m',
          hhblits_bfd_uniclust_result['a3m'])

      bfd_msa, bfd_deletion_matrix = parsers.parse_a3m(
          hhblits_bfd_uniclust_result['a3m'])

    templates_result = self.template_featurizer.get_templates(
        query_sequence=input_sequence,
        query_pdb_code=None,
        query_release_date=None,
        hits=hhsearch_hits)

    sequence_features = make_sequence_features(
        sequence=input_sequence,
        description=input_description,
        num_res=num_res)

    msa_features = make_msa_features(
        msas=(uniref90_msa, bfd_msa, mgnify_msa),
        deletion_matrices=(uniref90_deletion_matrix,
                           bfd_deletion_matrix,
                           mgnify_deletion_matrix))

    logging.info('Uniref90 MSA size: %d sequences.', len(uniref90_msa))
    logging.info('BFD MSA size: %d sequences.', len(bfd_msa))
    logging.info('MGnify MSA size: %d sequences.', len(mgnify_msa))
    logging.info('Final (deduplicated) MSA size: %d sequences.',
                 msa_features['num_alignments'][0])
    logging.info('Total number of templates (NB: this can include bad '
                 'templates and is later filtered to top 4): %d.',
                 templates_result.features['template_domain_names'].shape[0])

    return {**sequence_features, **msa_features, **templates_result.features}
alphafold/alphafold/data/templates.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for getting templates and calculating template features."""
16
+ import dataclasses
17
+ import datetime
18
+ import glob
19
+ import os
20
+ import re
21
+ from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
22
+
23
+ from absl import logging
24
+ from alphafold.common import residue_constants
25
+ from alphafold.data import mmcif_parsing
26
+ from alphafold.data import parsers
27
+ from alphafold.data.tools import kalign
28
+ import numpy as np
29
+
30
+ # Internal import (7716).
31
+
32
+
33
class Error(Exception):
  """Root of the template-processing exception hierarchy."""


class NoChainsError(Error):
  """Raised when a template mmCIF has no chains at all."""


class SequenceNotInTemplateError(Error):
  """Raised when the sequence is not present in the template mmCIF."""


class NoAtomDataInTemplateError(Error):
  """Raised when a template mmCIF has no atom positions."""


class TemplateAtomMaskAllZerosError(Error):
  """Raised when every atom position of a template mmCIF is masked."""


class QueryToTemplateAlignError(Error):
  """Raised when the query cannot be aligned to the template."""


class CaDistanceError(Error):
  """Raised when a CA atom distance is larger than a threshold."""


class MultipleChainsError(Error):
  """Raised when more than one chain is found for a given ID."""
63
+
64
+
65
+ # Prefilter exceptions.
66
class PrefilterError(Exception):
  """Root of the exceptions raised by the template prefilter."""


class DateError(PrefilterError):
  """Raised when the hit date is later than the max allowed date."""


class PdbIdError(PrefilterError):
  """Raised when the hit PDB ID is the same as the query's."""


class AlignRatioError(PrefilterError):
  """Raised when the hit's align ratio to the query is too small."""


class DuplicateError(PrefilterError):
  """Raised when the hit is an exact subsequence of the query."""


class LengthError(PrefilterError):
  """Raised when the hit is too short."""
88
+
89
+
90
# Maps each template feature name to the dtype used for that feature.
# The two string-valued features use the builtin `object`: `np.object` was
# only an alias for it, was deprecated in NumPy 1.20 and removed in 1.24, so
# referencing it makes this module fail at import on current NumPy.
TEMPLATE_FEATURES = {
    'template_aatype': np.float32,
    'template_all_atom_masks': np.float32,
    'template_all_atom_positions': np.float32,
    'template_domain_names': object,
    'template_sequence': object,
    'template_sum_probs': np.float32,
}
98
+
99
+
100
def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
  """Splits the leading 'PDBID_chain' of an HHSearch hit name.

  Args:
    hit: The hit whose `name` is parsed.

  Returns:
    A tuple of the lowercased PDB id and the chain id.

  Raises:
    ValueError: If `hit.name` does not start with 'PDBID_chain'.
  """
  # PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown.
  id_match = re.match(r'[a-zA-Z\d]{4}_[a-zA-Z0-9.]+', hit.name)
  if id_match is None:
    raise ValueError(f'hit.name did not start with PDBID_chain: {hit.name}')
  # The matched text holds exactly one '_' (neither part may contain one).
  pdb_id, _, chain_id = id_match.group(0).partition('_')
  return pdb_id.lower(), chain_id
108
+
109
+
110
def _is_after_cutoff(
    pdb_id: str,
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: Optional[datetime.datetime]) -> bool:
  """Tells whether a template was released after the cutoff date.

  Args:
    pdb_id: 4 letter pdb code.
    release_dates: Dictionary mapping PDB ids to their structure release dates.
    release_date_cutoff: Max release date that is valid for this query.

  Returns:
    True if the template release date is after the cutoff, False otherwise,
    including when `pdb_id` has no entry in `release_dates`.

  Raises:
    ValueError: If `release_date_cutoff` is None.
  """
  if release_date_cutoff is None:
    raise ValueError('The release_date_cutoff must not be None.')

  if pdb_id not in release_dates:
    # This is only a quick prefilter to reduce the number of mmCIF files we
    # need to parse, so a template with an unknown date is let through.
    logging.warning('Template structure not in release dates dict: %s', pdb_id)
    return False

  return release_dates[pdb_id] > release_date_cutoff
133
+
134
+
135
def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, Optional[str]]:
  """Reads the PDB file that lists obsolete pdb_ids and their replacements.

  Args:
    obsolete_file_path: Path of the obsolete-entries data file.

  Returns:
    A dict mapping each lowercased obsolete id to the lowercased id of its
    first listed replacement, or to None if the entry was removed.
  """
  replacements = {}
  with open(obsolete_file_path) as f:
    for raw_line in f:
      entry = raw_line.strip()
      # Format:    Date      From     To
      # 'OBSLTE    06-NOV-19 6G9Y'                  - Removed, rare
      # 'OBSLTE    31-JUL-94 116L     216L'         - Replaced, common
      # 'OBSLTE    26-SEP-06 2H33     2JM5 2OWI'    - Replaced by multiple, rare
      if not entry.startswith('OBSLTE'):
        continue
      obsolete_id = entry[20:24].lower()
      if len(entry) > 30:
        # Replaced by at least one structure; only the first is recorded.
        replacements[obsolete_id] = entry[29:33].lower()
      elif len(entry) == 24:
        # Removed.
        replacements[obsolete_id] = None
  return replacements
156
+
157
+
158
def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
  """Parses release dates file, returns a mapping from PDBs to release dates.

  Args:
    path: Path of a file ending in 'txt' with one 'pdb_id: YYYY-MM-DD' entry
      per line. Blank lines are ignored.

  Returns:
    A dict mapping each PDB id to its release date.

  Raises:
    ValueError: If `path` does not end in 'txt'.
  """
  if not path.endswith('txt'):
    raise ValueError('Invalid format of the release date file %s.' % path)

  release_dates = {}
  with open(path, 'r') as f:
    for line in f:
      if not line.strip():
        # A blank line (e.g. at the end of the file) has no ':' separator
        # and would otherwise fail to unpack below.
        continue
      pdb_id, date = line.split(':')
      date = date.strip()
      # Python 3.6 doesn't have datetime.date.fromisoformat() which is about
      # 90x faster than strptime. However, splitting the string manually is
      # about 10x faster than strptime.
      release_dates[pdb_id.strip()] = datetime.datetime(
          year=int(date[:4]), month=int(date[5:7]), day=int(date[8:10]))
  return release_dates
174
+
175
+
176
def _assess_hhsearch_hit(
    hit: parsers.TemplateHit,
    hit_pdb_code: str,
    query_sequence: str,
    query_pdb_code: Optional[str],
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: datetime.datetime,
    max_subsequence_ratio: float = 0.95,
    min_align_ratio: float = 0.1) -> bool:
  """Prefilters a template hit without parsing its mmCIF file.

  The checks run in a fixed order (date, PDB id, align ratio, duplicate,
  length) and the first failing one raises.

  Args:
    hit: HhrHit for the template.
    hit_pdb_code: The 4 letter pdb code of the template hit. May differ from
      the code stored in the hit if the original entry became obsolete.
    query_sequence: Amino acid sequence of the query.
    query_pdb_code: 4 letter pdb code of the query, or None to skip the
      identical-id check.
    release_dates: Dictionary mapping pdb codes to their structure release
      dates.
    release_date_cutoff: Max release date that is valid for this query.
    max_subsequence_ratio: A template that is an exact substring of the query
      is rejected when its length ratio to the query exceeds this value.
    min_align_ratio: Hits whose aligned-columns / query-length ratio is at or
      below this value are rejected.

  Returns:
    True if the hit passed the prefilter. Raises an exception otherwise.

  Raises:
    DateError: If the hit date was after the max allowed date.
    PdbIdError: If the hit PDB ID was identical to the query.
    AlignRatioError: If the hit align ratio to the query was too small.
    DuplicateError: If the hit was an exact subsequence of the query.
    LengthError: If the hit was too short.
  """
  # Both ratios are computed up front, before any of the checks below.
  aligned_cols = hit.aligned_cols
  query_length = len(query_sequence)
  align_ratio = aligned_cols / query_length

  template_sequence = hit.hit_sequence.replace('-', '')
  template_length = len(template_sequence)
  length_ratio = template_length / query_length

  # A template that is (nearly) the whole query verbatim is most likely the
  # query itself under another PDB entry.
  is_duplicate = (length_ratio > max_subsequence_ratio and
                  template_sequence in query_sequence)

  if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
    raise DateError(f'Date ({release_dates[hit_pdb_code]}) > max template date '
                    f'({release_date_cutoff}).')

  if (query_pdb_code is not None and
      query_pdb_code.lower() == hit_pdb_code.lower()):
    raise PdbIdError('PDB code identical to Query PDB code.')

  if align_ratio <= min_align_ratio:
    raise AlignRatioError('Proportion of residues aligned to query too small. '
                          f'Align ratio: {align_ratio}.')

  if is_duplicate:
    raise DuplicateError('Template is an exact subsequence of query with large '
                         f'coverage. Length ratio: {length_ratio}.')

  if template_length < 10:
    raise LengthError(f'Template too short. Length: {len(template_sequence)}.')

  return True
241
+
242
+
243
+ def _find_template_in_pdb(
244
+ template_chain_id: str,
245
+ template_sequence: str,
246
+ mmcif_object: mmcif_parsing.MmcifObject) -> Tuple[str, str, int]:
247
+ """Tries to find the template chain in the given pdb file.
248
+
249
+ This method tries the three following things in order:
250
+ 1. Tries if there is an exact match in both the chain ID and the sequence.
251
+ If yes, the chain sequence is returned. Otherwise:
252
+ 2. Tries if there is an exact match only in the sequence.
253
+ If yes, the chain sequence is returned. Otherwise:
254
+ 3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
255
+ If yes, the chain sequence is returned.
256
+ If none of these succeed, a SequenceNotInTemplateError is thrown.
257
+
258
+ Args:
259
+ template_chain_id: The template chain ID.
260
+ template_sequence: The template chain sequence.
261
+ mmcif_object: The PDB object to search for the template in.
262
+
263
+ Returns:
264
+ A tuple with:
265
+ * The chain sequence that was found to match the template in the PDB object.
266
+ * The ID of the chain that is being returned.
267
+ * The offset where the template sequence starts in the chain sequence.
268
+
269
+ Raises:
270
+ SequenceNotInTemplateError: If no match is found after the steps described
271
+ above.
272
+ """
273
+ # Try if there is an exact match in both the chain ID and the (sub)sequence.
274
+ pdb_id = mmcif_object.file_id
275
+ chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
276
+ if chain_sequence and (template_sequence in chain_sequence):
277
+ logging.info(
278
+ 'Found an exact template match %s_%s.', pdb_id, template_chain_id)
279
+ mapping_offset = chain_sequence.find(template_sequence)
280
+ return chain_sequence, template_chain_id, mapping_offset
281
+
282
+ # Try if there is an exact match in the (sub)sequence only.
283
+ for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
284
+ if chain_sequence and (template_sequence in chain_sequence):
285
+ logging.info('Found a sequence-only match %s_%s.', pdb_id, chain_id)
286
+ mapping_offset = chain_sequence.find(template_sequence)
287
+ return chain_sequence, chain_id, mapping_offset
288
+
289
+ # Return a chain sequence that fuzzy matches (X = wildcard) the template.
290
+ # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
291
+ regex = ['.' if aa == 'X' else '(?:%s|X)' % aa for aa in template_sequence]
292
+ regex = re.compile(''.join(regex))
293
+ for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
294
+ match = re.search(regex, chain_sequence)
295
+ if match:
296
+ logging.info('Found a fuzzy sequence-only match %s_%s.', pdb_id, chain_id)
297
+ mapping_offset = match.start()
298
+ return chain_sequence, chain_id, mapping_offset
299
+
300
+ # No hits, raise an error.
301
+ raise SequenceNotInTemplateError(
302
+ 'Could not find the template sequence in %s_%s. Template sequence: %s, '
303
+ 'chain_to_seqres: %s' % (pdb_id, template_chain_id, template_sequence,
304
+ mmcif_object.chain_to_seqres))
305
+
306
+
307
def _realign_pdb_template_to_query(
    old_template_sequence: str,
    template_chain_id: str,
    mmcif_object: mmcif_parsing.MmcifObject,
    old_mapping: Mapping[int, int],
    kalign_binary_path: str) -> Tuple[str, Mapping[int, int]]:
  """Re-maps a query-to-template alignment onto the sequence in the mmCIF file.

  Used when the template sequence reported by the search differs from the one
  actually stored in the mmCIF file. The two template sequences are aligned
  with kalign, and the old query->template mapping is translated through that
  alignment. The result is only accepted if the number of identical aligned
  residues is at least 90 % of the length of the shorter sequence.

  Note that old_template_sequence comes from the hit and covers only the part
  of the chain that matched the query, whereas the new template sequence is
  the full chain.

  Args:
    old_template_sequence: The template sequence that was returned by the PDB
      template search (typically done using HHSearch).
    template_chain_id: The template chain id returned by the PDB template
      search. Used to pick the chain from mmcif_object.chain_to_seqres.
    mmcif_object: A mmcif_object which holds the actual template data.
    old_mapping: A mapping from query sequence indices to indices in
      old_template_sequence.
    kalign_binary_path: The path to a kalign executable.

  Returns:
    A tuple (new_template_sequence, new_query_to_template_mapping) where:
    * new_template_sequence is the actual template sequence that was found in
      the mmcif_object.
    * new_query_to_template_mapping maps query indices to indices in
      new_template_sequence; query positions whose old template residue has no
      aligned counterpart map to -1.

  Raises:
    QueryToTemplateAlignError:
    * If the chain cannot be found and the file has more than one chain.
    * If there was an error thrown by the alignment tool.
    * Or if the actual template sequence differs by more than 10% from the
      old_template_sequence.
  """
  aligner = kalign.Kalign(binary_path=kalign_binary_path)
  chain_to_seqres = mmcif_object.chain_to_seqres
  new_template_sequence = chain_to_seqres.get(template_chain_id, '')

  # Sometimes the template chain id is unknown. With a single sequence in the
  # file there is no ambiguity, so fall back to that one.
  if not new_template_sequence:
    if len(chain_to_seqres) != 1:
      raise QueryToTemplateAlignError(
          f'Could not find chain {template_chain_id} in {mmcif_object.file_id}. '
          'If there are no mmCIF parsing errors, it is possible it was not a '
          'protein chain.')
    logging.info('Could not find %s in %s, but there is only 1 sequence, so '
                 'using that one.',
                 template_chain_id,
                 mmcif_object.file_id)
    new_template_sequence = next(iter(chain_to_seqres.values()))

  try:
    alignment = aligner.align([old_template_sequence, new_template_sequence])
    (old_aligned_template, new_aligned_template), _ = parsers.parse_a3m(
        alignment)
  except Exception as e:
    raise QueryToTemplateAlignError(
        'Could not align old template %s to template %s (%s_%s). Error: %s' %
        (old_template_sequence, new_template_sequence, mmcif_object.file_id,
         template_chain_id, str(e)))

  logging.info('Old aligned template: %s\nNew aligned template: %s',
               old_aligned_template, new_aligned_template)

  # Walk the pairwise alignment column by column, tracking the ungapped
  # position in each sequence.
  old_to_new = {}
  old_pos = -1
  new_pos = -1
  identical = 0
  for old_aa, new_aa in zip(old_aligned_template, new_aligned_template):
    old_present = old_aa != '-'
    new_present = new_aa != '-'
    if old_present:
      old_pos += 1
    if new_present:
      new_pos += 1
    if old_present and new_present:
      old_to_new[old_pos] = new_pos
      if old_aa == new_aa:
        identical += 1

  # Require at least 90 % sequence identity wrt to the shorter of the sequences.
  shorter_length = min(len(old_template_sequence), len(new_template_sequence))
  if float(identical) / shorter_length < 0.9:
    raise QueryToTemplateAlignError(
        'Insufficient similarity of the sequence in the database: %s to the '
        'actual sequence in the mmCIF file %s_%s: %s. We require at least '
        '90 %% similarity wrt to the shorter of the sequences. This is not a '
        'problem unless you think this is a template that should be included.' %
        (old_template_sequence, mmcif_object.file_id, template_chain_id,
         new_template_sequence))

  # Translate query -> old template into query -> new template; -1 marks
  # residues without an aligned counterpart.
  new_query_to_template_mapping = {
      query_index: old_to_new.get(old_index, -1)
      for query_index, old_index in old_mapping.items()
  }

  return new_template_sequence.replace('-', ''), new_query_to_template_mapping
416
+
417
+
418
def _check_residue_distances(all_positions: np.ndarray,
                             all_positions_mask: np.ndarray,
                             max_ca_ca_distance: float):
  """Verifies that consecutive unmasked residues have CA atoms close enough.

  Only directly adjacent residues are compared; a residue whose CA atom is
  masked out breaks the chain of comparisons.

  Args:
    all_positions: Per-residue atom coordinates, indexed by residue and then by
      atom slot.
    all_positions_mask: Per-residue atom mask, indexed the same way; a truthy
      CA entry means the CA coordinates are present.
    max_ca_ca_distance: Largest allowed distance between neighbouring CA atoms.

  Raises:
    CaDistanceError: If two neighbouring CA atoms are further apart than
      max_ca_ca_distance.
  """
  ca_slot = residue_constants.atom_order['CA']
  # CA coordinates of the directly preceding residue, or None if that residue
  # had no CA atom (or there is no preceding residue).
  previous_ca = None
  for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
    if not bool(mask[ca_slot]):
      previous_ca = None
      continue
    current_ca = coords[ca_slot]
    if previous_ca is not None:
      distance = np.linalg.norm(current_ca - previous_ca)
      if distance > max_ca_ca_distance:
        raise CaDistanceError(
            'The distance between residues %d and %d is %f > limit %f.' % (
                i, i + 1, distance, max_ca_ca_distance))
    previous_ca = current_ca
437
+
438
+
439
def _get_atom_positions(
    mmcif_object: mmcif_parsing.MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]:
  """Collects atom coordinates and an atom mask for one chain of an mmCIF entry.

  Args:
    mmcif_object: Parsed mmCIF entry to read the chain from.
    auth_chain_id: Id of the chain to extract.
    max_ca_ca_distance: Passed on to _check_residue_distances.

  Returns:
    A tuple (positions, mask) with one row per residue of the chain's SEQRES
    sequence. positions has shape [num_res, atom_type_num, 3]; mask has shape
    [num_res, atom_type_num] and is 1 where an atom was found. Residues flagged
    as missing stay all-zero in both arrays.

  Raises:
    MultipleChainsError: If the structure does not contain exactly one chain
      with the given id.
  """
  num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])

  matching_chains = [chain for chain in mmcif_object.structure.get_chains()
                     if chain.id == auth_chain_id]
  if len(matching_chains) != 1:
    raise MultipleChainsError(
        f'Expected exactly one chain in structure with id {auth_chain_id}.')
  chain = matching_chains[0]

  atom_order = residue_constants.atom_order
  num_atom_types = residue_constants.atom_type_num

  all_positions = np.zeros([num_res, num_atom_types, 3])
  all_positions_mask = np.zeros([num_res, num_atom_types], dtype=np.int64)
  for res_index in range(num_res):
    # Per-residue float32 scratch buffers, copied into the outputs below.
    pos = np.zeros([num_atom_types, 3], dtype=np.float32)
    mask = np.zeros([num_atom_types], dtype=np.float32)
    res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
    if not res_at_position.is_missing:
      residue_key = (res_at_position.hetflag,
                     res_at_position.position.residue_number,
                     res_at_position.position.insertion_code)
      res = chain[residue_key]
      for atom in res.get_atoms():
        atom_name = atom.get_name()
        x, y, z = atom.get_coord()
        if atom_name in atom_order:
          slot = atom_order[atom_name]
        elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE':
          # Put the coordinates of the selenium atom in the sulphur column.
          slot = atom_order['SD']
        else:
          continue
        pos[slot] = [x, y, z]
        mask[slot] = 1.0

    all_positions[res_index] = pos
    all_positions_mask[res_index] = mask

  _check_residue_distances(
      all_positions, all_positions_mask, max_ca_ca_distance)
  return all_positions, all_positions_mask
480
+
481
+
482
def _extract_template_features(
    mmcif_object: mmcif_parsing.MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str) -> Tuple[Dict[str, Any], Optional[str]]:
  """Parses atom positions in the target structure and aligns with the query.

  Atoms for each residue in the template structure are indexed to coincide
  with their corresponding residue in the query sequence, according to the
  alignment mapping provided.

  Args:
    mmcif_object: mmcif_parsing.MmcifObject representing the template.
    pdb_id: PDB code for the template.
    mapping: Dictionary mapping indices in the query sequence to indices in
      the template sequence.
    template_sequence: String describing the amino acid sequence for the
      template protein.
    query_sequence: String describing the amino acid sequence for the query
      protein.
    template_chain_id: String ID describing which chain in the structure proto
      should be used.
    kalign_binary_path: The path to a kalign executable used for template
      realignment.

  Returns:
    A tuple with:
    * A dictionary containing the extra features derived from the template
      protein structure.
    * A warning message if the hit was realigned to the actual mmCIF sequence.
      Otherwise None.

  Raises:
    NoChainsError: If the mmcif object doesn't contain any chains.
    SequenceNotInTemplateError: If the given chain id / sequence can't
      be found in the mmcif object.
    QueryToTemplateAlignError: If the actual template in the mmCIF file
      can't be aligned to the query.
    NoAtomDataInTemplateError: If the mmcif object doesn't contain
      atom positions.
    TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
      unmasked residues.
  """
  if mmcif_object is None or not mmcif_object.chain_to_seqres:
    raise NoChainsError('No chains in PDB: %s_%s' % (pdb_id, template_chain_id))

  warning = None
  try:
    seqres, chain_id, mapping_offset = _find_template_in_pdb(
        template_chain_id=template_chain_id,
        template_sequence=template_sequence,
        mmcif_object=mmcif_object)
  except SequenceNotInTemplateError:
    # If PDB70 contains a different version of the template, we use the sequence
    # from the mmcif_object.
    chain_id = template_chain_id
    warning = (
        f'The exact sequence {template_sequence} was not found in '
        f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.')
    logging.warning(warning)
    # This throws an exception if it fails to realign the hit.
    seqres, mapping = _realign_pdb_template_to_query(
        old_template_sequence=template_sequence,
        template_chain_id=template_chain_id,
        mmcif_object=mmcif_object,
        old_mapping=mapping,
        kalign_binary_path=kalign_binary_path)
    logging.info('Sequence in %s_%s: %s successfully realigned to %s',
                 pdb_id, chain_id, template_sequence, seqres)
    # The template sequence changed.
    template_sequence = seqres
    # No mapping offset, the query is aligned to the actual sequence.
    mapping_offset = 0

  try:
    # Essentially set to infinity - we don't want to reject templates unless
    # they're really really bad.
    all_atom_positions, all_atom_mask = _get_atom_positions(
        mmcif_object, chain_id, max_ca_ca_distance=150.0)
  except (CaDistanceError, KeyError) as ex:
    raise NoAtomDataInTemplateError(
        'Could not get atom data (%s_%s): %s' % (pdb_id, chain_id, str(ex))
    ) from ex

  # Start with "no template residue" for every query position; aligned
  # positions are filled in below.
  num_atom_types = residue_constants.atom_type_num
  templates_all_atom_positions = [
      np.zeros((num_atom_types, 3)) for _ in query_sequence]
  templates_all_atom_masks = [np.zeros(num_atom_types) for _ in query_sequence]
  output_templates_sequence = ['-'] * len(query_sequence)

  aligned_template_indices = []
  for query_index, template_pos in mapping.items():
    if template_pos < 0:
      # The realignment marks query residues without a template counterpart
      # with -1. Using that as an index would silently copy the last residue
      # of the template, so leave such positions as gaps.
      continue
    template_index = template_pos + mapping_offset
    aligned_template_indices.append(template_index)
    templates_all_atom_positions[query_index] = (
        all_atom_positions[template_index])
    templates_all_atom_masks[query_index] = all_atom_mask[template_index]
    output_templates_sequence[query_index] = template_sequence[template_pos]

  # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
  if np.sum(templates_all_atom_masks) < 5:
    if aligned_template_indices:
      residue_range = '%d-%d' % (min(aligned_template_indices),
                                 max(aligned_template_indices))
    else:
      # Nothing was aligned at all; min()/max() would raise on an empty
      # sequence and mask the real error.
      residue_range = 'empty'
    raise TemplateAtomMaskAllZerosError(
        'Template all atom mask was all zeros: %s_%s. Residue range: %s' %
        (pdb_id, chain_id, residue_range))

  output_templates_sequence = ''.join(output_templates_sequence)

  templates_aatype = residue_constants.sequence_to_onehot(
      output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID)

  return (
      {
          'template_all_atom_positions': np.array(templates_all_atom_positions),
          'template_all_atom_masks': np.array(templates_all_atom_masks),
          'template_sequence': output_templates_sequence.encode(),
          'template_aatype': np.array(templates_aatype),
          'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(),
      },
      warning)
610
+
611
+
612
+ def _build_query_to_hit_index_mapping(
613
+ hit_query_sequence: str,
614
+ hit_sequence: str,
615
+ indices_hit: Sequence[int],
616
+ indices_query: Sequence[int],
617
+ original_query_sequence: str) -> Mapping[int, int]:
618
+ """Gets mapping from indices in original query sequence to indices in the hit.
619
+
620
+ hit_query_sequence and hit_sequence are two aligned sequences containing gap
621
+ characters. hit_query_sequence contains only the part of the original query
622
+ sequence that matched the hit. When interpreting the indices from the .hhr, we
623
+ need to correct for this to recover a mapping from original query sequence to
624
+ the hit sequence.
625
+
626
+ Args:
627
+ hit_query_sequence: The portion of the query sequence that is in the .hhr
628
+ hit
629
+ hit_sequence: The portion of the hit sequence that is in the .hhr
630
+ indices_hit: The indices for each aminoacid relative to the hit sequence
631
+ indices_query: The indices for each aminoacid relative to the original query
632
+ sequence
633
+ original_query_sequence: String describing the original query sequence.
634
+
635
+ Returns:
636
+ Dictionary with indices in the original query sequence as keys and indices
637
+ in the hit sequence as values.
638
+ """
639
+ # If the hit is empty (no aligned residues), return empty mapping
640
+ if not hit_query_sequence:
641
+ return {}
642
+
643
+ # Remove gaps and find the offset of hit.query relative to original query.
644
+ hhsearch_query_sequence = hit_query_sequence.replace('-', '')
645
+ hit_sequence = hit_sequence.replace('-', '')
646
+ hhsearch_query_offset = original_query_sequence.find(hhsearch_query_sequence)
647
+
648
+ # Index of -1 used for gap characters. Subtract the min index ignoring gaps.
649
+ min_idx = min(x for x in indices_hit if x > -1)
650
+ fixed_indices_hit = [
651
+ x - min_idx if x > -1 else -1 for x in indices_hit
652
+ ]
653
+
654
+ min_idx = min(x for x in indices_query if x > -1)
655
+ fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]
656
+
657
+ # Zip the corrected indices, ignore case where both seqs have gap characters.
658
+ mapping = {}
659
+ for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
660
+ if q_t != -1 and q_i != -1:
661
+ if (q_t >= len(hit_sequence) or
662
+ q_i + hhsearch_query_offset >= len(original_query_sequence)):
663
+ continue
664
+ mapping[q_i + hhsearch_query_offset] = q_t
665
+
666
+ return mapping
667
+
668
+
669
@dataclasses.dataclass(frozen=True)
class SingleHitResult:
  """Immutable outcome of trying to turn one template hit into features."""
  # Extracted template features, or None if no features were produced.
  features: Optional[Mapping[str, Any]]
  # Error message for this hit, or None.
  error: Optional[str]
  # Warning message for this hit, or None.
  warning: Optional[str]
674
+
675
+
676
def _process_single_hit(
    query_sequence: str,
    query_pdb_code: Optional[str],
    hit: parsers.TemplateHit,
    mmcif_dir: str,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, Optional[str]],
    kalign_binary_path: str,
    strict_error_check: bool = False) -> SingleHitResult:
  """Tries to extract template features from a single HHSearch hit.

  Args:
    query_sequence: Amino acid sequence of the query.
    query_pdb_code: PDB code of the query, or None.
    hit: The template hit to process.
    mmcif_dir: Directory containing '<pdb_code>.cif' files.
    max_template_date: Templates released after this date are rejected.
    release_dates: Mapping from PDB codes to release dates, used by the
      prefilter.
    obsolete_pdbs: Mapping from obsolete PDB codes to their replacement, or to
      None if the entry was removed without replacement.
    kalign_binary_path: The path to a kalign executable used for template
      realignment.
    strict_error_check: If True, date / PDB id / duplicate prefilter failures,
      too-new templates and missing-data problems are reported as errors
      instead of being skipped silently or reported as warnings.

  Returns:
    A SingleHitResult. Its features are None whenever the hit was skipped.
  """
  # Fail hard if we can't get the PDB ID and chain name from the hit.
  hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

  is_obsolete = hit_pdb_code in obsolete_pdbs

  # This hit has been removed (obsoleted) from PDB, skip it.
  if is_obsolete and obsolete_pdbs[hit_pdb_code] is None:
    return SingleHitResult(
        features=None, error=None, warning=f'Hit {hit_pdb_code} is obsolete.')

  # An obsolete code without a known release date is swapped for its
  # replacement.
  if is_obsolete and hit_pdb_code not in release_dates:
    hit_pdb_code = obsolete_pdbs[hit_pdb_code]

  # Pass hit_pdb_code since it might have changed due to the pdb being obsolete.
  try:
    _assess_hhsearch_hit(
        hit=hit,
        hit_pdb_code=hit_pdb_code,
        query_sequence=query_sequence,
        query_pdb_code=query_pdb_code,
        release_dates=release_dates,
        release_date_cutoff=max_template_date)
  except PrefilterError as e:
    msg = f'hit {hit_pdb_code}_{hit_chain_id} did not pass prefilter: {str(e)}'
    logging.info('%s: %s', query_pdb_code, msg)
    # In strict mode we treat some prefilter cases as errors.
    report_as_error = strict_error_check and isinstance(
        e, (DateError, PdbIdError, DuplicateError))
    return SingleHitResult(
        features=None, error=msg if report_as_error else None, warning=None)

  mapping = _build_query_to_hit_index_mapping(
      hit.query, hit.hit_sequence, hit.indices_hit, hit.indices_query,
      query_sequence)

  # The mapping is from the query to the actual hit sequence, so we need to
  # remove gaps (which regardless have a missing confidence score).
  template_sequence = hit.hit_sequence.replace('-', '')

  cif_path = os.path.join(mmcif_dir, hit_pdb_code + '.cif')
  logging.info('Reading PDB entry from %s. Query: %s, template: %s',
               cif_path, query_sequence, template_sequence)
  # Fail if we can't find the mmCIF file.
  with open(cif_path, 'r') as cif_file:
    cif_string = cif_file.read()

  parsing_result = mmcif_parsing.parse(
      file_id=hit_pdb_code, mmcif_string=cif_string)
  mmcif_object = parsing_result.mmcif_object

  # Check the release date recorded in the file itself.
  if mmcif_object is not None:
    hit_release_date = datetime.datetime.strptime(
        mmcif_object.header['release_date'], '%Y-%m-%d')
    if hit_release_date > max_template_date:
      error = ('Template %s date (%s) > max template date (%s).' %
               (hit_pdb_code, hit_release_date, max_template_date))
      if strict_error_check:
        return SingleHitResult(features=None, error=error, warning=None)
      logging.warning(error)
      return SingleHitResult(features=None, error=None, warning=None)

  def _describe_failure(exc):
    """Formats a feature-extraction failure together with parsing errors."""
    return ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
            '%s, mmCIF parsing errors: %s'
            % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
               str(exc), parsing_result.errors))

  try:
    features, realign_warning = _extract_template_features(
        mmcif_object=mmcif_object,
        pdb_id=hit_pdb_code,
        mapping=mapping,
        template_sequence=template_sequence,
        query_sequence=query_sequence,
        template_chain_id=hit_chain_id,
        kalign_binary_path=kalign_binary_path)
    features['template_sum_probs'] = [hit.sum_probs]

    # It is possible there were some errors when parsing the other chains in the
    # mmCIF file, but the template features for the chain we want were still
    # computed. In such case the mmCIF parsing errors are not relevant.
    return SingleHitResult(
        features=features, error=None, warning=realign_warning)
  except (NoChainsError, NoAtomDataInTemplateError,
          TemplateAtomMaskAllZerosError) as e:
    # These 3 errors indicate missing mmCIF experimental data rather than a
    # problem with the template search, so turn them into warnings.
    message = _describe_failure(e)
    if strict_error_check:
      return SingleHitResult(features=None, error=message, warning=None)
    return SingleHitResult(features=None, error=None, warning=message)
  except Error as e:
    return SingleHitResult(
        features=None, error=_describe_failure(e), warning=None)
782
+
783
+
784
@dataclasses.dataclass(frozen=True)
class TemplateSearchResult:
  """Immutable aggregate result of a template search over many hits."""
  # Template features keyed by feature name.
  features: Mapping[str, Any]
  # Error messages collected while processing the hits.
  errors: Sequence[str]
  # Warning messages collected while processing the hits.
  warnings: Sequence[str]
789
+
790
+
791
class TemplateHitFeaturizer:
  """Turns hhr template hits into stacked template feature arrays."""

  def __init__(
      self,
      mmcif_dir: str,
      max_template_date: str,
      max_hits: int,
      kalign_binary_path: str,
      release_dates_path: Optional[str],
      obsolete_pdbs_path: Optional[str],
      strict_error_check: bool = False):
    """Initializes the Template Search.

    Args:
      mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
        is found by HHSearch, this directory is used to retrieve the template
        data.
      max_template_date: The maximum date permitted for template structures. No
        template with date higher than this date will be returned. In ISO8601
        date format, YYYY-MM-DD.
      max_hits: The maximum number of templates that will be returned.
      kalign_binary_path: The path to a kalign executable used for template
        realignment.
      release_dates_path: An optional path to a file with a mapping from PDB IDs
        to their release dates. Thanks to this we don't have to redundantly
        parse mmCIF files to get that information.
      obsolete_pdbs_path: An optional path to a file containing a mapping from
        obsolete PDB IDs to the PDB IDs of their replacements.
      strict_error_check: If True, then the following will be treated as errors:
        * If any template date is after the max_template_date.
        * If any template has identical PDB ID to the query.
        * If any template is a duplicate of the query.
        * Any feature computation errors.

    Raises:
      ValueError: If mmcif_dir contains no '*.cif' files, or if
        max_template_date is not in YYYY-MM-DD format.
    """
    self._mmcif_dir = mmcif_dir
    cif_files = glob.glob(os.path.join(self._mmcif_dir, '*.cif'))
    if not cif_files:
      logging.error('Could not find CIFs in %s', self._mmcif_dir)
      raise ValueError(f'Could not find CIFs in {self._mmcif_dir}')

    try:
      self._max_template_date = datetime.datetime.strptime(
          max_template_date, '%Y-%m-%d')
    except ValueError:
      raise ValueError(
          'max_template_date must be set and have format YYYY-MM-DD.')

    self._max_hits = max_hits
    self._kalign_binary_path = kalign_binary_path
    self._strict_error_check = strict_error_check

    # Both lookup tables are optional and default to empty.
    self._release_dates = {}
    if release_dates_path:
      logging.info('Using precomputed release dates %s.', release_dates_path)
      self._release_dates = _parse_release_dates(release_dates_path)

    self._obsolete_pdbs = {}
    if obsolete_pdbs_path:
      logging.info('Using precomputed obsolete pdbs %s.', obsolete_pdbs_path)
      self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)

  def get_templates(
      self,
      query_sequence: str,
      query_pdb_code: Optional[str],
      query_release_date: Optional[datetime.datetime],
      hits: Sequence[parsers.TemplateHit]) -> TemplateSearchResult:
    """Computes template features for a query from its search hits.

    Hits are processed in order of decreasing sum_probs until max_hits of them
    have produced features.

    Args:
      query_sequence: Amino acid sequence of the query.
      query_pdb_code: PDB code of the query, or None.
      query_release_date: Release date of the query, or None. If given, the
        template cutoff is the earlier of the configured max template date and
        this date minus 60 days.
      hits: Template hits to consider.

    Returns:
      A TemplateSearchResult with the stacked features of the accepted hits
      (empty arrays of the right dtype if there are none) plus all collected
      error and warning messages.
    """
    logging.info('Searching for template for: %s', query_pdb_code)

    template_features = {name: [] for name in TEMPLATE_FEATURES}

    # Always use a max_template_date. Set to query_release_date minus 60 days
    # if that's earlier.
    template_cutoff_date = self._max_template_date
    if query_release_date:
      template_cutoff_date = min(
          self._max_template_date,
          query_release_date - datetime.timedelta(days=60))
      assert template_cutoff_date < query_release_date
    assert template_cutoff_date <= self._max_template_date

    num_hits = 0
    errors = []
    warnings = []

    ranked_hits = sorted(hits, key=lambda h: h.sum_probs, reverse=True)
    for hit in ranked_hits:
      # We got all the templates we wanted, stop processing hits.
      if num_hits >= self._max_hits:
        break

      result = _process_single_hit(
          query_sequence=query_sequence,
          query_pdb_code=query_pdb_code,
          hit=hit,
          mmcif_dir=self._mmcif_dir,
          max_template_date=template_cutoff_date,
          release_dates=self._release_dates,
          obsolete_pdbs=self._obsolete_pdbs,
          strict_error_check=self._strict_error_check,
          kalign_binary_path=self._kalign_binary_path)

      if result.error:
        errors.append(result.error)

      # There could be an error even if there are some results, e.g. thrown by
      # other unparsable chains in the same mmCIF file.
      if result.warning:
        warnings.append(result.warning)

      if result.features is None:
        logging.info('Skipped invalid hit %s, error: %s, warning: %s',
                     hit.name, result.error, result.warning)
        continue

      # Count the hit, since we got features out of it.
      num_hits += 1
      for name, collected in template_features.items():
        collected.append(result.features[name])

    if num_hits > 0:
      for name in template_features:
        template_features[name] = np.stack(
            template_features[name], axis=0).astype(TEMPLATE_FEATURES[name])
    else:
      for name in template_features:
        # Make sure the feature has correct dtype even if empty.
        template_features[name] = np.array([], dtype=TEMPLATE_FEATURES[name])

    return TemplateSearchResult(
        features=template_features, errors=errors, warnings=warnings)
alphafold/alphafold/data/tools/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Python wrappers for third party tools."""
alphafold/alphafold/data/tools/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (202 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (219 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-36.pyc ADDED
Binary file (4.44 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-38.pyc ADDED
Binary file (4.51 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-36.pyc ADDED
Binary file (2.52 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-38.pyc ADDED
Binary file (2.58 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-36.pyc ADDED
Binary file (5.23 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-38.pyc ADDED
Binary file (5.34 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/kalign.cpython-36.pyc ADDED
Binary file (3.04 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/kalign.cpython-38.pyc ADDED
Binary file (3.09 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/utils.cpython-36.pyc ADDED
Binary file (929 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/utils.cpython-38.pyc ADDED
Binary file (954 Bytes). View file
 
alphafold/alphafold/data/tools/hhblits.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to run HHblits from Python."""
16
+
17
+ import glob
18
+ import os
19
+ import subprocess
20
+ from typing import Any, Mapping, Optional, Sequence
21
+
22
+ from absl import logging
23
+ from alphafold.data.tools import utils
24
+ # Internal import (7716).
25
+
26
+
27
# HHblits' own defaults for -p and -Z. The corresponding flags are only added
# to the command line when the caller asks for a different value.
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500


class HHBlits:
  """Python wrapper of the HHblits binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               n_cpu: int = 4,
               n_iter: int = 3,
               e_value: float = 0.001,
               maxseq: int = 1_000_000,
               realign_max: int = 100_000,
               maxfilt: int = 100_000,
               min_prefilter_hits: int = 1000,
               all_seqs: bool = False,
               alt: Optional[int] = None,
               p: int = _HHBLITS_DEFAULT_P,
               z: int = _HHBLITS_DEFAULT_Z):
    """Initializes the Python HHblits wrapper.

    Args:
      binary_path: The path to the HHblits executable.
      databases: A sequence of HHblits database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      n_cpu: The number of CPUs to give HHblits.
      n_iter: The number of HHblits iterations.
      e_value: The E-value, see HHblits docs for more details.
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.
      realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
      maxfilt: Max number of hits allowed to pass the 2nd prefilter.
        HHblits default: 20000.
      min_prefilter_hits: Min number of hits to pass prefilter.
        HHblits default: 100.
      all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
        HHblits default: False.
      alt: Show up to this many alternative alignments. The flag is omitted
        when this is None (or any other falsy value, e.g. 0).
      p: Minimum Prob for a hit to be included in the output hhr file.
        HHblits default: 20.
      z: Hard cap on number of hits reported in the hhr file.
        HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

    Raises:
      ValueError: If no file matching `<database_path>_*` exists for one of
        the given database prefixes. Note that the binary path itself is not
        checked here; a bad binary only fails once `query` is called.
    """
    self.binary_path = binary_path
    self.databases = databases

    # A database is identified by a path prefix; it counts as present as soon
    # as at least one `<prefix>_*` file exists.
    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHBlits database %s', database_path)
        raise ValueError(f'Could not find HHBlits database {database_path}')

    self.n_cpu = n_cpu
    self.n_iter = n_iter
    self.e_value = e_value
    self.maxseq = maxseq
    self.realign_max = realign_max
    self.maxfilt = maxfilt
    self.min_prefilter_hits = min_prefilter_hits
    self.all_seqs = all_seqs
    self.alt = alt
    self.p = p
    self.z = z

  def query(self, input_fasta_path: str) -> Mapping[str, Any]:
    """Queries the database using HHblits.

    Args:
      input_fasta_path: Path of the query file handed to HHblits via `-i`.

    Returns:
      A mapping with the keys `a3m` (text content of the a3m file written by
      HHblits), `output` (raw stdout bytes), `stderr` (raw stderr bytes),
      `n_iter` and `e_value` (the settings used for this run).

    Raises:
      RuntimeError: If HHblits exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

      db_cmd = []
      for db_path in self.databases:
        db_cmd.append('-d')
        db_cmd.append(db_path)
      cmd = [
          self.binary_path,
          '-i', input_fasta_path,
          '-cpu', str(self.n_cpu),
          '-oa3m', a3m_path,
          '-o', '/dev/null',
          '-n', str(self.n_iter),
          '-e', str(self.e_value),
          '-maxseq', str(self.maxseq),
          '-realign_max', str(self.realign_max),
          '-maxfilt', str(self.maxfilt),
          '-min_prefilter_hits', str(self.min_prefilter_hits)]
      if self.all_seqs:
        cmd += ['-all']
      if self.alt:
        cmd += ['-alt', str(self.alt)]
      # Only pass -p / -Z when they differ from HHblits' built-in defaults.
      if self.p != _HHBLITS_DEFAULT_P:
        cmd += ['-p', str(self.p)]
      if self.z != _HHBLITS_DEFAULT_Z:
        cmd += ['-Z', str(self.z)]
      cmd += db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

      with utils.timing('HHblits query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Decode leniently: the tool output is not guaranteed to be valid
        # UTF-8, and the byte-level truncation below can cut a multi-byte
        # sequence in half. A strict decode would raise UnicodeDecodeError
        # here and hide the actual HHblits failure.
        # Logs have a 15k character limit, so log HHblits error line by line.
        logging.error('HHblits failed. HHblits stderr begin:')
        for error_line in stderr.decode('utf-8', errors='replace').splitlines():
          if error_line.strip():
            logging.error(error_line.strip())
        logging.error('HHblits stderr end')
        raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
            stdout.decode('utf-8', errors='replace'),
            stderr[:500_000].decode('utf-8', errors='replace')))

      # Read the alignment before the temporary directory goes out of scope.
      with open(a3m_path) as f:
        a3m = f.read()

    raw_output = dict(
        a3m=a3m,
        output=stdout,
        stderr=stderr,
        n_iter=self.n_iter,
        e_value=self.e_value)
    return raw_output
alphafold/alphafold/data/tools/hhsearch.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to run HHsearch from Python."""
16
+
17
+ import glob
18
+ import os
19
+ import subprocess
20
+ from typing import Sequence
21
+
22
+ from absl import logging
23
+
24
+ from alphafold.data.tools import utils
25
+ # Internal import (7716).
26
+
27
+
28
class HHSearch:
  """Python wrapper of the HHsearch binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               maxseq: int = 1_000_000):
    """Initializes the Python HHsearch wrapper.

    Args:
      binary_path: The path to the HHsearch executable.
      databases: A sequence of HHsearch database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.

    Raises:
      ValueError: If no file matching `<database_path>_*` exists for one of
        the given database prefixes. Note that the binary path itself is not
        checked here; a bad binary only fails once `query` is called.
    """
    self.binary_path = binary_path
    self.databases = databases
    self.maxseq = maxseq

    # A database is identified by a path prefix; it counts as present as soon
    # as at least one `<prefix>_*` file exists.
    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHsearch database %s', database_path)
        raise ValueError(f'Could not find HHsearch database {database_path}')

  def query(self, a3m: str) -> str:
    """Queries the database using HHsearch using a given a3m.

    Args:
      a3m: Text of the query alignment; it is written to a temporary file
        that is passed to HHsearch via `-i`.

    Returns:
      The text content of the hhr file written by HHsearch.

    Raises:
      RuntimeError: If HHsearch exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      input_path = os.path.join(query_tmp_dir, 'query.a3m')
      hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
      with open(input_path, 'w') as f:
        f.write(a3m)

      db_cmd = []
      for db_path in self.databases:
        db_cmd.append('-d')
        db_cmd.append(db_path)
      cmd = [self.binary_path,
             '-i', input_path,
             '-o', hhr_path,
             '-maxseq', str(self.maxseq)
             ] + db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      with utils.timing('HHsearch query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Stderr is truncated to prevent proto size errors in Beam.
        # Decode leniently: the truncation works on raw bytes and can cut a
        # multi-byte UTF-8 sequence in half (and tool output may not be valid
        # UTF-8 at all). A strict decode would raise UnicodeDecodeError here
        # and hide the actual HHsearch failure.
        raise RuntimeError(
            'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                stdout.decode('utf-8', errors='replace'),
                stderr[:100_000].decode('utf-8', errors='replace')))

      # Read the result before the temporary directory goes out of scope.
      with open(hhr_path) as f:
        hhr = f.read()
    return hhr