acmc.parse

parse.py module

This module provides functionality to set up medical code translation classes

  1"""
  2parse.py module
  3
  4This module provides functionality to set up medical code translation classes
  5
  6"""
  7
  8import pandas as pd
  9import numpy as np
 10import os
 11from typing import Callable, Optional, Tuple
 12from pathlib import Path
 13from acmc import trud, logging_config as lc
 14
 15# setup logging
 16_logger = lc.setup_logger()
 17
 18SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}
 19"""Set of supported medical coding types"""
 20
 21
 22class CodesError:
 23    """A class used in InvalidCodesException to report an error if a code parser check fails"""
 24
 25    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
 26        # initialise class variables with provided parameters
 27        for key, value in locals().items():
 28            if key != "self":
 29                setattr(self, key, value)
 30
 31
 32class InvalidCodesException(Exception):
 33    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""
 34
 35    def __init__(self, error):
 36        super().__init__(error.message)
 37        self.error = error
 38
 39
 40class Proto:
 41    """
 42    Define checks as list of 3 tuple: (Message, Condition, Process)
 43    - Message = The name of the condition (what is printed and logged)
 44    - Condition = True if Passed, and False if Failed
 45    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 46    """
 47
 48    checks: list[
 49        tuple[
 50            str,  # The description, e.g., "Not Empty"
 51            Callable[
 52                [pd.DataFrame],
 53                pd.Series,
 54            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 55            Callable[
 56                [pd.DataFrame, Path],
 57                pd.DataFrame,
 58            ],  # The second lambda function: takes a list and a string, and returns nothing
 59        ]
 60    ]
 61
 62    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 63        if trud_codes_path is not None:
 64            if trud_codes_path.is_file():
 65                self.trud_codes_path: Path = trud_codes_path
 66                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 67            else:
 68                raise FileNotFoundError(
 69                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 70                )
 71
 72        self.name: str = name
 73
 74    def raise_exception(self, ex: Exception):
 75        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 76        raise ex
 77
 78    def in_database(
 79        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 80    ) -> pd.DataFrame:
 81        return codes.isin(db[col])
 82
 83    def process(
 84        self, codes: pd.DataFrame, codes_file: Path
 85    ) -> Tuple[pd.DataFrame, list]:
 86        """identify issues that do not pass and fix them with define/d process"""
 87        errors = []
 88        # Iter through each item in check.
 89        for msg, cond, fix in self.checks:
 90            # Check if any codes fail the check to False
 91            if not cond(codes).all():
 92                # Log the number of codes that failed
 93                _logger.debug(
 94                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 95                )
 96                # try fix errors by running lamba "process" function
 97                try:
 98                    codes = fix(codes, codes_file)
 99                    _logger.debug(f"Check: Fixed")
100                except InvalidCodesException as ex:
101                    errors.append(ex.error)
102                    codes = codes[cond(codes)]  # remove codes that cannot be fixed
103                    _logger.debug(f"Check: Invalid Codes Removed, no fix available")
104            else:
105                _logger.debug(f"Check: passed")
106
107        return codes, errors
108
109    def verify(self, codes: pd.DataFrame, codes_file: Path):
110        """verify codes in codes file"""
111        conds = np.array([])
112
113        # Iter through each item in check.
114        for msg, cond, process in self.checks:
115            # run conditional check
116            out = cond(codes)
117            conds = np.append(conds, out.all())
118
119        return conds
120
121
class Read2(Proto):
    """Parser for Read2 codes backed by the processed TRUD table ``read2.parquet``.

    Checks, in order: the code list is not empty; codes shorter than 5
    characters are right-padded with dots; codes longer than 5 characters are
    truncated to 5; codes contain only letters, digits and dots; codes occur in
    the ``read2`` column of the loaded table. Checks that have no fix raise
    InvalidCodesException carrying a CodesError.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def reject(message, passes=None):
            """Build a Process step that cannot fix anything and always raises.

            When ``passes`` (the check's own condition) is supplied, it is
            evaluated on the codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                mask = passes(codes) if passes is not None else None
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def at_least_five(codes):
            return ~(codes.str.len() < 5)

        def pad_to_five(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def at_most_five(codes):
            return ~(codes.str.len() > 5)

        def first_five(codes, codes_file):
            return codes.str[:5]

        def alnum_or_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        # validate checks: (Message, Condition, Process)
        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            ("Too Short", at_least_five, pad_to_five),
            ("Too Long", at_most_five, first_five),
            (
                "Alphanumeric Dot",
                alnum_or_dot,
                reject("Illegal code format, not alphanumeric dot", alnum_or_dot),
            ),
            (
                "In Database",
                known_code,
                reject("Codes do not exist in database", known_code),
            ),
        ]
193
194
class Read3(Proto):
    """Parser for Read3 codes backed by the processed TRUD table ``read3.parquet``.

    Checks, in order: the code list is not empty; codes shorter than 5
    characters are right-padded with dots; codes longer than 5 characters are
    truncated to 5; codes contain only letters, digits and dots; codes occur in
    the ``read3`` column of the loaded table.
    """

    def __init__(self):
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def reject(message, passes=None):
            """Build a Process step that cannot fix anything and always raises.

            When ``passes`` (the check's own condition) is supplied, it is
            evaluated on the codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                mask = passes(codes) if passes is not None else None
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def at_least_five(codes):
            return ~(codes.str.len() < 5)

        def pad_to_five(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def at_most_five(codes):
            return ~(codes.str.len() > 5)

        def first_five(codes, codes_file):
            return codes.str[:5]

        def alnum_or_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process)
        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            ("Too Short", at_least_five, pad_to_five),
            ("Too Long", at_most_five, first_five),
            ("Alphanumeric Dot", alnum_or_dot, reject("QA Alphanumeric Dot", alnum_or_dot)),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]
258
259
class Icd10(Proto):
    """Parser for ICD10 codes backed by the processed TRUD table ``icd10.parquet``.

    Checks, in order: the code list is not empty; codes have at least 3
    characters; dots are removed from codes; codes contain only capital letters
    and digits; codes occur in the ``icd10`` or ``icd10_alt`` column of the
    loaded table.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def reject(message, passes):
            """Build a Process step that cannot fix anything and always raises.

            ``passes`` is the check's own condition; it is evaluated on the
            codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=passes(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def fail_empty(codes, codes_file):
            # The message embeds the codes file path, so it is built per call
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        f"Code list is empty {codes_file}",
                        codes=codes,
                        codes_file=codes_file,
                        mask=None,
                        code_type=self.name,
                    )
                )
            )

        def at_least_three(codes):
            return ~(codes.str.len() < 3)

        def no_dot(codes):
            return ~(codes.str.match(r".*\..*"))  # check if contains dot

        def strip_dots(codes, codes_file):
            # regex=False: delete literal dots whatever the default of the
            # installed pandas version is
            return codes.str.replace(".", "", regex=False)

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known_code(codes):
            # A code is accepted when it appears in either column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        # (Message, Condition, Process)
        self.checks = [
            ("Not Empty", has_codes, fail_empty),
            ("Too Short", at_least_three, reject("QA Too Short", at_least_three)),
            ("Has Dot", no_dot, strip_dots),
            (
                "Alphanumeric Capital",
                capital_alnum,
                reject("QA Alphanumeric Capital", capital_alnum),
            ),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]
348
349
class Snomed(Proto):
    """Parser for SNOMED codes backed by the processed TRUD table ``snomed.parquet``.

    Checks, in order: codes have at least 6 characters; codes have at most 18
    characters; anything from the first dot onwards is dropped; codes end in a
    run of digits starting at the first character; codes occur in the
    ``snomed`` column of the loaded table.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def reject(message, passes):
            """Build a Process step that cannot fix anything and always raises.

            ``passes`` is the check's own condition; it is evaluated on the
            codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=passes(codes),
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def at_least_six(codes):
            return ~(codes.str.len() < 6)

        def at_most_eighteen(codes):
            return ~(codes.str.len() > 18)

        def no_decimal_point(codes):
            # regex=False: look for a literal dot. As a regular expression "."
            # matches any character, which made every non-empty code fail.
            return ~codes.str.contains(".", regex=False)

        def drop_fraction(codes, codes_file):
            # Keep the part before the first dot, e.g. a code read as a float
            return codes.str.split(".").str[0].astype(str)

        def digits_only(codes):
            return codes.str.match(r"[0-9]+$")

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        # NOTE(review): the "Not Empty" check used by the sibling parsers is not
        # applied here - confirm whether that is intended.
        # (Message, Condition, Process)
        self.checks = [
            ("Too Short", at_least_six, reject("QA Too Short", at_least_six)),
            ("Too Long", at_most_eighteen, reject("QA Too Long", at_most_eighteen)),
            ("Is Integer", no_decimal_point, drop_fraction),
            ("Numeric", digits_only, reject("QA Numeric", digits_only)),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]
428
429
class Opcs4(Proto):
    """Parser for OPCS4 codes backed by the processed TRUD table ``opcs4.parquet``.

    Checks, in order: the code list is not empty; codes occur in the ``opcs4``
    column of the loaded table. Neither check has a fix, so a failure raises
    InvalidCodesException carrying a CodesError.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def reject(message, passes=None):
            """Build a Process step that cannot fix anything and always raises.

            When ``passes`` (the check's own condition) is supplied, it is
            evaluated on the codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                mask = passes(codes) if passes is not None else None
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def known_code(codes):
            return self.in_database(codes, self.db, self.name)

        # (Message, Condition, Process)
        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            ("In Database", known_code, reject("QA In Database", known_code)),
        ]
466
467
class Atc(Proto):
    """Parser for ATC codes; no TRUD table is loaded for this code type.

    Checks, in order: the code list is not empty; codes contain only capital
    letters and digits. Neither check has a fix, so a failure raises
    InvalidCodesException carrying a CodesError.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def reject(message, passes=None):
            """Build a Process step that cannot fix anything and always raises.

            When ``passes`` (the check's own condition) is supplied, it is
            evaluated on the codes to fill the error's ``mask``.
            """

            def fail(codes, codes_file):
                mask = passes(codes) if passes is not None else None
                return self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            message,
                            codes=codes,
                            codes_file=codes_file,
                            mask=mask,
                            code_type=self.name,
                        )
                    )
                )

            return fail

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def capital_alnum(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        # (Message, Condition, Process)
        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                capital_alnum,
                reject("QA Alphanumeric Capital", capital_alnum),
            ),
        ]
503
504
class Med(Proto):
    """Parser for "med" codes; no TRUD table is loaded for this code type.

    The only check is that the code list is not empty; it has no fix, so a
    failure raises InvalidCodesException carrying a CodesError.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def fail_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        # (Message, Condition, Process)
        self.checks = [("Not Empty", has_codes, fail_empty)]
525
526
class Cprd(Proto):
    """Parser for "cprd" codes; no TRUD table is loaded for this code type.

    The only check is that the code list is not empty; it has no fix, so a
    failure raises InvalidCodesException carrying a CodesError.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def fail_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        # (Message, Condition, Process)
        self.checks = [("Not Empty", has_codes, fail_empty)]
547
548
class CodeTypeParser:
    """Holds one parser instance per code type, keyed by code-type name.

    After construction ``code_types`` maps "read2", "read3", "icd10", "snomed",
    "opcs4", "atc", "med" and "cprd" to their parser objects.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory exists, then build every parser.

        Raises:
            FileNotFoundError: if ``trud_processed_dir`` is not an existing
                directory.
        """
        # NOTE(review): trud_processed_dir is only validated here. The parsers
        # below are constructed without it and build their own file paths from
        # trud.PROCESSED_PATH - confirm whether a custom directory should be
        # passed through.
        # is_dir() is already False for a path that does not exist
        if not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }
SUPPORTED_CODE_TYPES = {'read3', 'opcs4', 'snomed', 'atc', 'icd10', 'read2'}

Set of supported medical coding types

class CodesError:
class CodesError:
    """Details of a failed code-parser check, carried by InvalidCodesException.

    Each constructor argument is stored unchanged on the instance under the
    same name: ``message``, ``codes``, ``codes_file``, ``mask``, ``code_type``.
    """

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Assign explicitly instead of looping over locals(): the attribute set
        # stays visible to readers and static analysis, and cannot silently
        # change if a local variable is ever added to this method.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type

A class used in InvalidCodesException to report an error if a code parser check fails

CodesError(message, codes=None, codes_file=None, mask=None, code_type=None)
26    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
27        # initialise class variables with provided parameters
28        for key, value in locals().items():
29            if key != "self":
30                setattr(self, key, value)
class InvalidCodesException(builtins.Exception):
class InvalidCodesException(Exception):
    """Raised when invalid codes are found that processing cannot resolve.

    The exception text is the ``message`` of the supplied error object; the
    object itself is kept on ``error`` for callers that catch the exception.
    """

    def __init__(self, error):
        self.error = error
        super().__init__(error.message)

Custom exception class raised when invalid codes are found that cannot be resolved by processing

InvalidCodesException(error)
36    def __init__(self, error):
37        super().__init__(error.message)
38        self.error = error
error
class Proto:
 41class Proto:
 42    """
 43    Define checks as list of 3 tuple: (Message, Condition, Process)
 44    - Message = The name of the condition (what is printed and logged)
 45    - Condition = True if Passed, and False if Failed
 46    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
 47    """
 48
 49    checks: list[
 50        tuple[
 51            str,  # The description, e.g., "Not Empty"
 52            Callable[
 53                [pd.DataFrame],
 54                pd.Series,
 55            ],  # The first lambda function: takes a list and returns a pd.Series of booleans
 56            Callable[
 57                [pd.DataFrame, Path],
 58                pd.DataFrame,
 59            ],  # The second lambda function: takes a list and a string, and returns nothing
 60        ]
 61    ]
 62
 63    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
 64        if trud_codes_path is not None:
 65            if trud_codes_path.is_file():
 66                self.trud_codes_path: Path = trud_codes_path
 67                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
 68            else:
 69                raise FileNotFoundError(
 70                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
 71                )
 72
 73        self.name: str = name
 74
 75    def raise_exception(self, ex: Exception):
 76        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
 77        raise ex
 78
 79    def in_database(
 80        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
 81    ) -> pd.DataFrame:
 82        return codes.isin(db[col])
 83
 84    def process(
 85        self, codes: pd.DataFrame, codes_file: Path
 86    ) -> Tuple[pd.DataFrame, list]:
 87        """identify issues that do not pass and fix them with define/d process"""
 88        errors = []
 89        # Iter through each item in check.
 90        for msg, cond, fix in self.checks:
 91            # Check if any codes fail the check to False
 92            if not cond(codes).all():
 93                # Log the number of codes that failed
 94                _logger.debug(
 95                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 96                )
 97                # try fix errors by running lamba "process" function
 98                try:
 99                    codes = fix(codes, codes_file)
100                    _logger.debug(f"Check: Fixed")
101                except InvalidCodesException as ex:
102                    errors.append(ex.error)
103                    codes = codes[cond(codes)]  # remove codes that cannot be fixed
104                    _logger.debug(f"Check: Invalid Codes Removed, no fix available")
105            else:
106                _logger.debug(f"Check: passed")
107
108        return codes, errors
109
110    def verify(self, codes: pd.DataFrame, codes_file: Path):
111        """verify codes in codes file"""
112        conds = np.array([])
113
114        # Iter through each item in check.
115        for msg, cond, process in self.checks:
116            # run conditional check
117            out = cond(codes)
118            conds = np.append(conds, out.all())
119
120        return conds

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Proto(name: str, trud_codes_path: Optional[pathlib.Path] = None)
63    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
64        if trud_codes_path is not None:
65            if trud_codes_path.is_file():
66                self.trud_codes_path: Path = trud_codes_path
67                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
68            else:
69                raise FileNotFoundError(
70                    f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
71                )
72
73        self.name: str = name
checks: list[tuple[str, typing.Callable[[pandas.core.frame.DataFrame], pandas.core.series.Series], typing.Callable[[pandas.core.frame.DataFrame, pathlib.Path], pandas.core.frame.DataFrame]]]
name: str
def raise_exception(self, ex: Exception):
75    def raise_exception(self, ex: Exception):
76        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explict"""
77        raise ex

Raises an exception inside a lambda function. Python does not allow the raise statement inside a lambda because a lambda can only contain expressions, not statements. Named raise_exception rather than raise_ because it is more explicit.

def in_database( self, codes: pandas.core.frame.DataFrame, db: pandas.core.frame.DataFrame, col: str) -> pandas.core.frame.DataFrame:
79    def in_database(
80        self, codes: pd.DataFrame, db: pd.DataFrame, col: str
81    ) -> pd.DataFrame:
82        return codes.isin(db[col])
def process( self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path) -> Tuple[pandas.core.frame.DataFrame, list]:
 84    def process(
 85        self, codes: pd.DataFrame, codes_file: Path
 86    ) -> Tuple[pd.DataFrame, list]:
 87        """identify issues that do not pass and fix them with define/d process"""
 88        errors = []
 89        # Iter through each item in check.
 90        for msg, cond, fix in self.checks:
 91            # Check if any codes fail the check to False
 92            if not cond(codes).all():
 93                # Log the number of codes that failed
 94                _logger.debug(
 95                    f"Check: {msg} {(~cond(codes)).sum()} failed, trying to fix"
 96                )
 97                # try fix errors by running lamba "process" function
 98                try:
 99                    codes = fix(codes, codes_file)
100                    _logger.debug(f"Check: Fixed")
101                except InvalidCodesException as ex:
102                    errors.append(ex.error)
103                    codes = codes[cond(codes)]  # remove codes that cannot be fixed
104                    _logger.debug(f"Check: Invalid Codes Removed, no fix available")
105            else:
106                _logger.debug(f"Check: passed")
107
108        return codes, errors

Identify checks that do not pass and fix them with the defined process

def verify(self, codes: pandas.core.frame.DataFrame, codes_file: pathlib.Path):
110    def verify(self, codes: pd.DataFrame, codes_file: Path):
111        """verify codes in codes file"""
112        conds = np.array([])
113
114        # Iter through each item in check.
115        for msg, cond, process in self.checks:
116            # run conditional check
117            out = cond(codes)
118            conds = np.append(conds, out.all())
119
120        return conds

verify codes in codes file

class Read2(Proto):
class Read2(Proto):
    """Parser for "Read2" codes built on the Proto check/fix machinery.

    The constructor passes the "read2" name and the processed read2.parquet
    path to Proto, then installs five checks that run in list order: not
    empty, too short (right-padded with dots to 5 characters), too long (cut
    to 5 characters), alphanumeric-or-dot format, and membership of the Read2
    database. Checks that have no repair hand an InvalidCodesException to
    ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        code_pattern = r"^[a-zA-Z0-9.]+$"

        def well_formed(codes):
            # True where a code contains only letters, digits and dots
            return codes.str.match(code_pattern)

        def known(codes):
            # True where a code is present in the read2 column of self.db
            return self.in_database(codes, self.db, self.name)

        def reject(message, passing=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                mask = passing(codes) if passing is not None else None
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            return codes.str[:5]

        # validate checks
        self.checks = [
            # codes must not be empty; an empty list cannot be repaired
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            # codes shorter than 5 characters are padded with dots
            ("Too Short", lambda codes: ~(codes.str.len() < 5), pad_with_dots),
            # codes longer than 5 characters are truncated
            ("Too Long", lambda codes: ~(codes.str.len() > 5), truncate),
            # codes may only contain letters, digits and dots
            (
                "Alphanumeric Dot",
                well_formed,
                reject("Illegal code format, not alphanumeric dot", well_formed),
            ),
            # codes must exist in self.db (the Read2 dataset)
            (
                "In Database",
                known,
                reject("Codes do not exist in database", known),
            ),
        ]

This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.

checks
class Read3(Proto):
class Read3(Proto):
    """Parser for "read3" codes built on the Proto check/fix machinery.

    Installs the same five checks as the Read2 listing (not empty, too
    short, too long, alphanumeric-or-dot, in database), backed by the
    processed read3.parquet path and with "QA ..." error messages for the
    format and database checks.
    """

    def __init__(self):
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        code_pattern = r"^[a-zA-Z0-9.]+$"

        def well_formed(codes):
            # True where a code contains only letters, digits and dots
            return codes.str.match(code_pattern)

        def known(codes):
            # True where a code is present in the read3 column of self.db
            return self.in_database(codes, self.db, self.name)

        def reject(message, passing=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                mask = passing(codes) if passing is not None else None
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def truncate(codes, codes_file):
            return codes.str[:5]

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            # short codes are right-padded with dots up to 5 characters
            ("Too Short", lambda codes: ~(codes.str.len() < 5), pad_with_dots),
            # long codes are cut down to their first 5 characters
            ("Too Long", lambda codes: ~(codes.str.len() > 5), truncate),
            (
                "Alphanumeric Dot",
                well_formed,
                reject("QA Alphanumeric Dot", well_formed),
            ),
            (
                "In Database",
                known,
                reject("QA In Database", known),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Icd10(Proto):
class Icd10(Proto):
    """Parser for "icd10" codes built on the Proto check/fix machinery.

    Checks, in order: not empty, at least 3 characters, no dots (dots are
    stripped), upper-case alphanumeric only, and membership of either the
    "icd10" or the "icd10_alt" column of ``self.db``. Checks that have no
    repair hand an InvalidCodesException to ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def upper_alphanumeric(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # A code passes when it is found in the primary or the alternative
            # column; this is the De Morgan form of ~(~primary & ~alt).
            return self.in_database(
                codes, self.db, self.name
            ) | self.in_database(codes, self.db, self.name + "_alt")

        def reject(message, passing):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=passing(codes),
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        def reject_empty(codes, codes_file):
            # The message embeds the file path, so it is formatted per call.
            error = CodesError(
                f"Code list is empty {codes_file}",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        def strip_dots(codes, codes_file):
            # regex=False makes the literal-dot replacement explicit instead of
            # depending on the pandas-version-specific default of ``regex``.
            return codes.str.replace(".", "", regex=False)

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject_empty,
            ),
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                strip_dots,  # delete any dots in string
            ),
            (
                "Alphanumeric Capital",
                upper_alphanumeric,
                reject("QA Alphanumeric Capital", upper_alphanumeric),
            ),
            ("In Database", known, reject("QA In Database", known)),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Snomed(Proto):
class Snomed(Proto):
    """Parser for "snomed" codes built on the Proto check/fix machinery.

    Checks, in order: at least 6 characters, at most 18 characters, no
    decimal point (anything from the first dot onwards is dropped), digits
    only, and membership of the "snomed" column of ``self.db``. The
    "Not Empty" check used by the other parsers is not enabled here.
    Checks that have no repair hand an InvalidCodesException to
    ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def has_no_dot(codes):
            # regex=False is essential: as a regular expression "." matches
            # any character, which made this check fail for every non-empty
            # code regardless of whether it contained a dot.
            return ~codes.str.contains(".", regex=False)

        def numeric(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, passing):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=passing(codes),
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        def drop_fraction(codes, codes_file):
            # Keep only the text before the first dot, e.g. "123456.0" -> "123456"
            return codes.str.split(".").str[0].astype(str)

        self.checks = [
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, reject("QA Too Long", short_enough)),
            ("Is Integer", has_no_dot, drop_fraction),
            ("Numeric", numeric, reject("QA Numeric", numeric)),
            ("In Database", known, reject("QA In Database", known)),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Opcs4(Proto):
class Opcs4(Proto):
    """Parser for "opcs4" codes built on the Proto check/fix machinery.

    Installs two checks: the code list must not be empty, and every code
    must be present in the "opcs4" column of ``self.db``. Neither check has
    a repair; both hand an InvalidCodesException to ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, passing=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                mask = passing(codes) if passing is not None else None
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            ("In Database", known, reject("QA In Database", known)),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Atc(Proto):
class Atc(Proto):
    """Parser for "atc" codes built on the Proto check/fix machinery.

    Constructed without a TRUD codes path. Installs two checks: the code
    list must not be empty, and every code must consist of upper-case
    letters and digits only. Neither check has a repair; both hand an
    InvalidCodesException to ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def upper_alphanumeric(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def reject(message, passing=None):
            """Build a fix step that reports the failure instead of repairing it."""

            def fail(codes, codes_file):
                mask = passing(codes) if passing is not None else None
                error = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(error))

            return fail

        self.checks = [
            ("Not Empty", has_codes, reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                upper_alphanumeric,
                reject("QA Alphanumeric Capital", upper_alphanumeric),
            ),
        ]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Med(Proto):
class Med(Proto):
    """Parser for "med" codes; the only check is that the list is not empty.

    Constructed without a TRUD codes path. An empty code list cannot be
    repaired, so the fix step hands an InvalidCodesException to
    ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", has_codes, reject_empty)]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class Cprd(Proto):
class Cprd(Proto):
    """Parser for "cprd" codes; the only check is that the list is not empty.

    Constructed without a TRUD codes path. An empty code list cannot be
    repaired, so the fix step hands an InvalidCodesException to
    ``self.raise_exception``.
    """

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            # single-element Series: True when at least one code is present
            return pd.Series([len(codes) > 0])

        def reject_empty(codes, codes_file):
            error = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(error))

        self.checks = [("Not Empty", has_codes, reject_empty)]

Define checks as list of 3 tuple: (Message, Condition, Process)

  • Message = The name of the condition (what is printed and logged)
  • Condition = True if Passed, and False if Failed
  • Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
checks
class CodeTypeParser:
class CodeTypeParser:
    """Registry that builds one parser instance per code type.

    After construction, ``code_types`` maps each code type name ("read2",
    "read3", "icd10", "snomed", "opcs4", "atc", "med", "cprd") to an
    instance of the matching parser class.
    """

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        """Check the TRUD processed directory, then create all parsers.

        Args:
            trud_processed_dir: directory expected to hold the processed
                TRUD files. NOTE(review): it is only validated here; the
                parser classes in this module build their own paths from
                trud.PROCESSED_PATH rather than from this argument.

        Raises:
            FileNotFoundError: if the path does not exist or is not a
                directory.
        """
        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # One parser instance per code type, keyed by code type name
        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }

Builds and holds one parser instance per code type (read2, read3, icd10, snomed, opcs4, atc, med, cprd) in `code_types`, after checking that the TRUD processed directory exists

CodeTypeParser(trud_processed_dir: pathlib.Path = PosixPath('vocab/trud/processed'))
553    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
554        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
555            raise FileNotFoundError(
556                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
557            )
558
559        self.code_types = {
560            "read2": Read2(),
561            "read3": Read3(),
562            "icd10": Icd10(),
563            "snomed": Snomed(),
564            "opcs4": Opcs4(),
565            "atc": Atc(),
566            "med": Med(),
567            "cprd": Cprd(),
568        }
code_types