acmc.parse
parse.py module
This module provides functionality to set up medical code translation classes
"""
parse.py module

This module provides functionality to set up medical code translation classes

"""

import pandas as pd
import numpy as np
import os
from typing import Callable, Optional, Tuple
from pathlib import Path
from acmc import trud, logging_config as lc

# setup logging
_logger = lc.setup_logger()

SUPPORTED_CODE_TYPES = {"read2", "read3", "icd10", "snomed", "opcs4", "atc"}
"""List of supported medical coding types"""


class CodesError:
    """A class used in InvalidCodesException to report an error if a code parser check fails"""

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Explicit assignments: the stored attributes are exactly the constructor
        # parameters, independent of whatever locals() happens to contain.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type


class InvalidCodesException(Exception):
    """Custom exception class raised when invalid codes are found that cannot be resolved by processing"""

    def __init__(self, error):
        super().__init__(error.message)
        self.error = error


class Proto:
    """
    Base class for code type parsers.

    Define checks as a list of 3-tuples: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = True if Passed, and False if Failed
    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
    """

    checks: list[
        tuple[
            str,  # The description, e.g., "Not Empty"
            Callable[
                [pd.Series],
                pd.Series,
            ],  # Condition: takes the codes and returns a boolean pd.Series
            Callable[
                [pd.Series, Path],
                pd.Series,
            ],  # Process: takes the codes and the codes file, returns fixed codes or raises
        ]
    ]

    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
        """Store the code type name and, when a path is given, load its parquet table into self.db.

        Raises:
            FileNotFoundError: if trud_codes_path is given but is not an existing file.
        """
        if trud_codes_path is not None:
            if trud_codes_path.is_file():
                self.trud_codes_path: Path = trud_codes_path
                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
            else:
                # Report the code type actually being loaded, not always "Read2"
                raise FileNotFoundError(
                    f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
                )

        self.name: str = name

    def raise_exception(self, ex: Exception):
        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explicit"""
        raise ex

    def in_database(self, codes: pd.Series, db: pd.DataFrame, col: str) -> pd.Series:
        """Return a boolean mask marking which codes appear in column `col` of `db`."""
        return codes.isin(db[col])

    def _not_empty(self, codes) -> pd.Series:
        """Condition for the "Not Empty" check: a one-element Series, True when codes has rows."""
        return pd.Series([len(codes) > 0])

    def _in_own_db(self, codes) -> pd.Series:
        """Condition: membership of codes in the self.db column named after this code type."""
        return self.in_database(codes, self.db, self.name)

    def _reject(self, message: str, mask_fn=None):
        """Build a Process function for checks that have no automatic fix.

        The returned function always raises InvalidCodesException carrying a
        CodesError; `mask_fn`, when given, is applied to the codes to produce
        the pass/fail mask stored on the error.
        """

        def fixer(codes, codes_file):
            mask = None if mask_fn is None else mask_fn(codes)
            return self.raise_exception(
                InvalidCodesException(
                    CodesError(
                        message,
                        codes=codes,
                        codes_file=codes_file,
                        mask=mask,
                        code_type=self.name,
                    )
                )
            )

        return fixer

    def process(self, codes: pd.Series, codes_file: Path) -> Tuple[pd.Series, list]:
        """Run every check in order, fixing or dropping codes that fail.

        Returns:
            The processed codes and the list of CodesError objects collected
            from checks whose Process raised InvalidCodesException.
        """
        errors = []
        for msg, cond, fix in self.checks:
            # Evaluate the condition once; it may involve a database lookup
            passed = cond(codes)
            if passed.all():
                _logger.debug("Check: passed")
                continue

            _logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
            try:
                codes = fix(codes, codes_file)
                _logger.debug("Check: Fixed")
            except InvalidCodesException as ex:
                errors.append(ex.error)
                # fix raised before rebinding codes, so the mask still lines up
                codes = codes[passed]  # remove codes that cannot be fixed
                _logger.debug("Check: Invalid Codes Removed, no fix available")

        return codes, errors

    def verify(self, codes: pd.Series, codes_file: Path):
        """Evaluate every check's condition without fixing anything.

        Returns:
            A float64 numpy array with one entry per check (1.0 when all codes
            pass that check, 0.0 otherwise). codes_file is accepted but unused.
        """
        outcomes = [cond(codes).all() for _, cond, _ in self.checks]
        # np.append onto an empty float array keeps the historical float64 dtype
        return np.append(np.array([]), outcomes)


class Read2(Proto):
    """This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary."""

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def alnum_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        self.checks = [
            # codes must not be empty, otherwise an exception is recorded
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
            (
                # codes shorter than 5 characters are right-padded with dots
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                # codes longer than 5 characters are truncated to 5
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                # codes may only contain letters, digits and dots
                "Alphanumeric Dot",
                alnum_dot,
                self._reject("Illegal code format, not alphanumeric dot", alnum_dot),
            ),
            (
                # codes must exist in self.db (the Read2 dataset)
                "In Database",
                self._in_own_db,
                self._reject("Codes do not exist in database", self._in_own_db),
            ),
        ]


class Read3(Proto):
    """Parser for "read3" codes; same 5-character shape rules as Read2, checked against read3.parquet."""

    def __init__(self):
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def alnum_dot(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        self.checks = [
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
            (
                "Too Short",
                lambda codes: ~(codes.str.len() < 5),
                lambda codes, codes_file: codes.str.pad(
                    width=5, side="right", fillchar="."
                ),
            ),
            (
                "Too Long",
                lambda codes: ~(codes.str.len() > 5),
                lambda codes, codes_file: codes.str[:5],
            ),
            (
                "Alphanumeric Dot",
                alnum_dot,
                self._reject("QA Alphanumeric Dot", alnum_dot),
            ),
            (
                "In Database",
                self._in_own_db,
                self._reject("QA In Database", self._in_own_db),
            ),
        ]


class Icd10(Proto):
    """Parser for "icd10" codes, checked against the "icd10" and "icd10_alt" columns of icd10.parquet."""

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def alnum_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # a code is valid if found in either the main or the alternative column
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        self.checks = [
            (
                "Not Empty",
                self._not_empty,
                # bespoke fixer: this message includes the codes file
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            ("Too Short", long_enough, self._reject("QA Too Short", long_enough)),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # regex=False: delete literal dots regardless of the pandas default
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                alnum_capital,
                self._reject("QA Alphanumeric Capital", alnum_capital),
            ),
            ("In Database", known, self._reject("QA In Database", known)),
        ]


class Snomed(Proto):
    """Parser for "snomed" codes, checked against snomed.parquet."""

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def no_decimal_point(codes):
            # regex=False: as a regex, "." matches any character, which made
            # this check fail for every non-empty code
            return ~codes.str.contains(".", regex=False)

        def digits_only(codes):
            return codes.str.match(r"[0-9]+$")

        self.checks = [
            ("Too Short", long_enough, self._reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, self._reject("QA Too Long", short_enough)),
            (
                "Is Integer",
                no_decimal_point,
                lambda codes, codes_file: codes.str.split(".")
                .str[0]
                .astype(str),  # keep only the part before the decimal point
            ),
            ("Numeric", digits_only, self._reject("QA Numeric", digits_only)),
            (
                "In Database",
                self._in_own_db,
                self._reject("QA In Database", self._in_own_db),
            ),
        ]


class Opcs4(Proto):
    """Parser for "opcs4" codes, checked against opcs4.parquet."""

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        self.checks = [
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
            (
                "In Database",
                self._in_own_db,
                self._reject("QA In Database", self._in_own_db),
            ),
        ]


class Atc(Proto):
    """Parser for "atc" codes; no database table is loaded, only format checks are run."""

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def alnum_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        self.checks = [
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
            (
                "Alphanumeric Capital",
                alnum_capital,
                self._reject("QA Alphanumeric Capital", alnum_capital),
            ),
        ]


class Med(Proto):
    """Parser for "med" codes; the only check is that the code list is not empty."""

    def __init__(self):
        super().__init__("med", trud_codes_path=None)
        self.checks = [
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
        ]


class Cprd(Proto):
    """Parser for "cprd" codes; the only check is that the code list is not empty."""

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)
        self.checks = [
            ("Not Empty", self._not_empty, self._reject("Code list is empty")),
        ]


class CodeTypeParser:
    """Builds one parser instance per code type and exposes them in the `code_types` dict, keyed by code type name"""

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        if not trud_processed_dir.exists() or not trud_processed_dir.is_dir():
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # NOTE(review): trud_processed_dir is only validated here; each parser
        # below loads its table from trud.PROCESSED_PATH, not from this argument.
        self.code_types = {
            "read2": Read2(),
            "read3": Read3(),
            "icd10": Icd10(),
            "snomed": Snomed(),
            "opcs4": Opcs4(),
            "atc": Atc(),
            "med": Med(),
            "cprd": Cprd(),
        }
List of supported medical coding types
class CodesError:
    """A class used in InvalidCodesException to report an error if a code parser check fails

    Attributes set from the constructor arguments:
        message: description of the failed check
        codes: the codes that were being checked (None if not supplied)
        codes_file: the file the codes came from (None if not supplied)
        mask: pass/fail mask for the codes (None if not supplied)
        code_type: name of the code type being parsed (None if not supplied)
    """

    def __init__(self, message, codes=None, codes_file=None, mask=None, code_type=None):
        # Assign each parameter explicitly rather than iterating over locals(),
        # so the attribute set cannot change if a local variable is ever added.
        self.message = message
        self.codes = codes
        self.codes_file = codes_file
        self.mask = mask
        self.code_type = code_type
A class used in InvalidCodesException to report an error if a code parser check fails
class InvalidCodesException(Exception):
    """Exception signalling that some codes failed a check and no automatic fix exists.

    The error object passed in is kept on the `error` attribute and its
    `message` attribute becomes the exception text.
    """

    def __init__(self, error):
        # keep the full error report for callers that catch the exception
        self.error = error
        Exception.__init__(self, error.message)
Custom exception class raised when invalid codes are found that cannot be resolved by processing
class Proto:
    """
    Base class for code type parsers.

    Define checks as a list of 3-tuples: (Message, Condition, Process)
    - Message = The name of the condition (what is printed and logged)
    - Condition = True if Passed, and False if Failed
    - Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
    """

    checks: list[
        tuple[
            str,  # The description, e.g., "Not Empty"
            Callable[
                [pd.Series],
                pd.Series,
            ],  # Condition: takes the codes and returns a boolean pd.Series
            Callable[
                [pd.Series, Path],
                pd.Series,
            ],  # Process: takes the codes and the codes file, returns fixed codes or raises
        ]
    ]

    def __init__(self, name: str, trud_codes_path: Optional[Path] = None):
        """Store the code type name and, when a path is given, load its parquet table into self.db.

        Args:
            name: code type name; stored as self.name.
            trud_codes_path: parquet file to load, or None to load nothing.

        Raises:
            FileNotFoundError: if trud_codes_path is given but is not an existing file.
        """
        if trud_codes_path is not None:
            if trud_codes_path.is_file():
                self.trud_codes_path: Path = trud_codes_path
                self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path)
            else:
                # Report the code type actually being loaded, not always "Read2"
                raise FileNotFoundError(
                    f"Error: {name} code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly"
                )

        self.name: str = name

    def raise_exception(self, ex: Exception):
        """Raises an exception inside a lambda function. Python does not allow using raise statement inside lambda because lambda can only contain expressions, not statements. Using raise_exception not raise_ as it's more explicit"""
        raise ex

    def in_database(self, codes: pd.Series, db: pd.DataFrame, col: str) -> pd.Series:
        """Return a boolean mask marking which codes appear in column `col` of `db`."""
        return codes.isin(db[col])

    def process(self, codes: pd.Series, codes_file: Path) -> Tuple[pd.Series, list]:
        """Run every check in order, fixing or dropping codes that fail.

        Args:
            codes: the codes to check (each check's Condition is applied to them).
            codes_file: passed through to each check's Process function.

        Returns:
            The processed codes and the list of error objects collected from
            checks whose Process raised InvalidCodesException.
        """
        errors = []
        for msg, cond, fix in self.checks:
            # Evaluate the condition once per check; it may be a database lookup
            passed = cond(codes)
            if passed.all():
                _logger.debug("Check: passed")
                continue

            _logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
            try:
                codes = fix(codes, codes_file)
                _logger.debug("Check: Fixed")
            except InvalidCodesException as ex:
                errors.append(ex.error)
                # fix raised before rebinding codes, so the mask still lines up
                codes = codes[passed]  # remove codes that cannot be fixed
                _logger.debug("Check: Invalid Codes Removed, no fix available")

        return codes, errors

    def verify(self, codes: pd.Series, codes_file: Path):
        """Evaluate every check's condition without fixing anything.

        Returns:
            A float64 numpy array with one entry per check (1.0 when all codes
            pass that check, 0.0 otherwise). codes_file is accepted but unused.
        """
        outcomes = [cond(codes).all() for _, cond, _ in self.checks]
        # appending to an empty float array keeps the historical float64 dtype
        return np.append(np.array([]), outcomes)
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
63 def __init__(self, name: str, trud_codes_path: Optional[Path] = None): 64 if trud_codes_path is not None: 65 if trud_codes_path.is_file(): 66 self.trud_codes_path: Path = trud_codes_path 67 self.db: pd.DataFrame = pd.read_parquet(self.trud_codes_path) 68 else: 69 raise FileNotFoundError( 70 f"Error: Read2 code file '{trud_codes_path}' does not exist. Please ensure you have installed TRUD correctly" 71 ) 72 73 self.name: str = name
def raise_exception(self, ex: Exception):
    """Raise the given exception.

    A lambda body must be an expression and `raise` is a statement, so
    lambdas call this method when they need to raise. The longer name is
    used in preference to `raise_` because it is more explicit.
    """
    raise ex
Raises an exception inside a lambda function. Python does not allow a raise statement inside a lambda because a lambda can only contain expressions, not statements. Named raise_exception rather than raise_ because it is more explicit.
def process(
    self, codes: pd.DataFrame, codes_file: Path
) -> Tuple[pd.DataFrame, list]:
    """Run every check in order, fixing or dropping codes that fail.

    For each (message, condition, fix) entry in self.checks the condition
    is evaluated once. If any code fails, the fix is applied; when the fix
    raises InvalidCodesException its error is collected and the failing
    codes are removed instead.

    Args:
        codes: the codes to check.
        codes_file: passed through to each check's fix function.

    Returns:
        The processed codes and the list of collected error objects.
    """
    errors = []
    for msg, cond, fix in self.checks:
        # Evaluate the condition a single time; it can be a database lookup
        # and was previously recomputed for the log line and the filter.
        passed = cond(codes)
        if passed.all():
            _logger.debug("Check: passed")
            continue

        # Log the number of codes that failed
        _logger.debug(f"Check: {msg} {(~passed).sum()} failed, trying to fix")
        try:
            codes = fix(codes, codes_file)
            _logger.debug("Check: Fixed")
        except InvalidCodesException as ex:
            errors.append(ex.error)
            # fix raised before rebinding codes, so the mask still lines up
            codes = codes[passed]  # remove codes that cannot be fixed
            _logger.debug("Check: Invalid Codes Removed, no fix available")

    return codes, errors
Identify codes that do not pass each check and fix them with the check's defined process
def verify(self, codes: pd.DataFrame, codes_file: Path):
    """Report, for each check, whether every code satisfies its condition.

    Nothing is fixed or removed. The result is a numpy array with one
    entry per item of self.checks, in order; codes_file is not used.
    """
    # run each conditional check and reduce it to a single pass/fail value
    outcomes = [condition(codes).all() for _, condition, _ in self.checks]
    return np.append(np.array([]), outcomes)
verify codes in codes file
class Read2(Proto):
    """Parser for "Read2" codes built on Proto.

    Loads read2.parquet from the TRUD processed path and defines checks that
    pad short codes, truncate long ones, and report codes with illegal
    characters or codes missing from the loaded table.
    """

    def __init__(self):
        super().__init__("read2", trud.PROCESSED_PATH / "read2.parquet")

        def legal_characters(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of=None):
            # Build a "fix" for checks that cannot be repaired: it raises
            # InvalidCodesException, recomputing the pass mask when mask_of is given.
            def fixer(codes, codes_file):
                report = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask_of(codes) if mask_of is not None else None,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(report))

            return fixer

        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def keep_first_five(codes, codes_file):
            return codes.str.slice(stop=5)

        # validation checks, applied in this order
        self.checks = [
            # an empty code list cannot be fixed
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            # codes under 5 characters are right-padded with dots
            ("Too Short", lambda codes: ~(codes.str.len() < 5), pad_with_dots),
            # codes over 5 characters are cut down to 5
            ("Too Long", lambda codes: ~(codes.str.len() > 5), keep_first_five),
            # only letters, digits and dots are allowed
            (
                "Alphanumeric Dot",
                legal_characters,
                reject("Illegal code format, not alphanumeric dot", legal_characters),
            ),
            # every code must be present in the loaded Read2 table
            ("In Database", known, reject("Codes do not exist in database", known)),
        ]
This Read2 class extends Proto, adding custom validation checks for a dataset of "Read2" codes. It ensures that the dataset is loaded, validates the codes based on several rules, and applies corrections or logs errors when necessary.
Inherited Members
class Read3(Proto):
    """Parser for "read3" codes: loads read3.parquet and applies five checks in order (not empty, too short, too long, alphanumeric/dot, in database)."""

    def __init__(self):
        super().__init__("read3", trud.PROCESSED_PATH / "read3.parquet")

        def legal_characters(codes):
            return codes.str.match(r"^[a-zA-Z0-9.]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of=None):
            # Fixer for unrepairable checks: always raises InvalidCodesException
            def fixer(codes, codes_file):
                report = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask_of(codes) if mask_of is not None else None,
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(report))

            return fixer

        def pad_with_dots(codes, codes_file):
            return codes.str.pad(width=5, side="right", fillchar=".")

        def keep_first_five(codes, codes_file):
            return codes.str.slice(stop=5)

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                reject("Code list is empty"),
            ),
            ("Too Short", lambda codes: ~(codes.str.len() < 5), pad_with_dots),
            ("Too Long", lambda codes: ~(codes.str.len() > 5), keep_first_five),
            (
                "Alphanumeric Dot",
                legal_characters,
                reject("QA Alphanumeric Dot", legal_characters),
            ),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Icd10(Proto):
    """Parser for "icd10" codes.

    Loads icd10.parquet and checks, in order: not empty, at least 3
    characters, no dots (dots are stripped), capital alphanumeric only, and
    presence in either the "icd10" or "icd10_alt" column of the table.
    """

    def __init__(self):
        super().__init__("icd10", trud.PROCESSED_PATH / "icd10.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 3)

        def alnum_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def known(codes):
            # valid when found in either the main or the alternative column
            # (equivalent to the former ~(~main & ~alt) by De Morgan)
            return self.in_database(codes, self.db, self.name) | self.in_database(
                codes, self.db, self.name + "_alt"
            )

        def reject(message, mask_of):
            # Fixer for unrepairable checks: always raises InvalidCodesException
            def fixer(codes, codes_file):
                report = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask_of(codes),
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(report))

            return fixer

        self.checks = [
            (
                "Not Empty",
                lambda codes: pd.Series([len(codes) > 0]),
                lambda codes, codes_file: self.raise_exception(
                    InvalidCodesException(
                        CodesError(
                            f"Code list is empty {codes_file}",
                            codes=codes,
                            codes_file=codes_file,
                            mask=None,
                            code_type=self.name,
                        )
                    )
                ),
            ),
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            (
                "Has Dot",
                lambda codes: ~(codes.str.match(r".*\..*")),  # check if contains dot
                # regex=False makes the literal-dot removal explicit; the
                # default for `regex` differs between pandas versions
                lambda codes, codes_file: codes.str.replace(".", "", regex=False),
            ),
            (
                "Alphanumeric Capital",
                alnum_capital,
                reject("QA Alphanumeric Capital", alnum_capital),
            ),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Snomed(Proto):
    """Parser for "snomed" codes.

    Loads snomed.parquet and checks, in order: at least 6 characters, at most
    18 characters, no decimal point (anything from the first dot onwards is
    dropped), digits only, and presence in the loaded table.
    """

    def __init__(self):
        super().__init__("snomed", trud.PROCESSED_PATH / "snomed.parquet")

        def long_enough(codes):
            return ~(codes.str.len() < 6)

        def short_enough(codes):
            return ~(codes.str.len() > 18)

        def no_decimal_point(codes):
            # regex=False is essential: as a regex, "." matches any character,
            # so the check used to report every non-empty code as failing.
            return ~codes.str.contains(".", regex=False)

        def digits_only(codes):
            return codes.str.match(r"[0-9]+$")

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def reject(message, mask_of):
            # Fixer for unrepairable checks: always raises InvalidCodesException
            def fixer(codes, codes_file):
                report = CodesError(
                    message,
                    codes=codes,
                    codes_file=codes_file,
                    mask=mask_of(codes),
                    code_type=self.name,
                )
                return self.raise_exception(InvalidCodesException(report))

            return fixer

        self.checks = [
            ("Too Short", long_enough, reject("QA Too Short", long_enough)),
            ("Too Long", short_enough, reject("QA Too Long", short_enough)),
            (
                "Is Integer",
                no_decimal_point,
                lambda codes, codes_file: codes.str.split(".")
                .str[0]
                .astype(str),  # keep only the part before the decimal point
            ),
            ("Numeric", digits_only, reject("QA Numeric", digits_only)),
            ("In Database", known, reject("QA In Database", known)),
        ]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Opcs4(Proto):
    """Parser for "opcs4" codes: loads opcs4.parquet and checks that the code list is not empty and that every code is in the table."""

    def __init__(self):
        super().__init__("opcs4", trud.PROCESSED_PATH / "opcs4.parquet")

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def known(codes):
            return self.in_database(codes, self.db, self.name)

        def fail_empty(codes, codes_file):
            report = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        def fail_unknown(codes, codes_file):
            report = CodesError(
                "QA In Database",
                codes=codes,
                codes_file=codes_file,
                mask=known(codes),
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        self.checks = [
            ("Not Empty", has_codes, fail_empty),
            ("In Database", known, fail_unknown),
        ]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Atc(Proto):
    """Parser for "atc" codes: no table is loaded; checks that the code list is not empty and that codes are capital alphanumeric."""

    def __init__(self):
        super().__init__("atc", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def alnum_capital(codes):
            return codes.str.match(r"^[A-Z0-9]+$")

        def fail_empty(codes, codes_file):
            report = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        def fail_format(codes, codes_file):
            report = CodesError(
                "QA Alphanumeric Capital",
                codes=codes,
                codes_file=codes_file,
                mask=alnum_capital(codes),
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        self.checks = [
            ("Not Empty", has_codes, fail_empty),
            ("Alphanumeric Capital", alnum_capital, fail_format),
        ]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Med(Proto):
    """Parser for "med" codes: no table is loaded and the only check is that the code list is not empty."""

    def __init__(self):
        super().__init__("med", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def fail_empty(codes, codes_file):
            # an empty list cannot be repaired, so report it via an exception
            report = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        self.checks = [("Not Empty", has_codes, fail_empty)]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class Cprd(Proto):
    """Parser for "cprd" codes: no table is loaded and the only check is that the code list is not empty."""

    def __init__(self):
        super().__init__("cprd", trud_codes_path=None)

        def has_codes(codes):
            return pd.Series([len(codes) > 0])

        def fail_empty(codes, codes_file):
            # an empty list cannot be repaired, so report it via an exception
            report = CodesError(
                "Code list is empty",
                codes=codes,
                codes_file=codes_file,
                mask=None,
                code_type=self.name,
            )
            return self.raise_exception(InvalidCodesException(report))

        self.checks = [("Not Empty", has_codes, fail_empty)]
Define checks as a list of 3-tuples: (Message, Condition, Process)
- Message = The name of the condition (what is printed and logged)
- Condition = True if Passed, and False if Failed
- Process = Aims to resolve all issues that stop condition from passing (Do not change index!)
Inherited Members
class CodeTypeParser:
    """Builds one parser instance per code type and exposes them in the `code_types` dict, keyed by code type name"""

    def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
        # the directory must both exist and be a directory
        directory_ready = trud_processed_dir.exists() and trud_processed_dir.is_dir()
        if not directory_ready:
            raise FileNotFoundError(
                f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
            )

        # (code type name, parser class) pairs, instantiated in this order
        parser_classes = (
            ("read2", Read2),
            ("read3", Read3),
            ("icd10", Icd10),
            ("snomed", Snomed),
            ("opcs4", Opcs4),
            ("atc", Atc),
            ("med", Med),
            ("cprd", Cprd),
        )
        self.code_types = {label: parser_cls() for label, parser_cls in parser_classes}
A class that builds one parser instance per code type and holds them in the code_types dictionary, keyed by code type name
def __init__(self, trud_processed_dir: Path = trud.PROCESSED_PATH):
    """Check the TRUD processed directory, then create one parser per code type.

    Raises FileNotFoundError when trud_processed_dir is missing or is not a
    directory. On success self.code_types maps each code type name to its
    parser instance.
    """
    if not (trud_processed_dir.exists() and trud_processed_dir.is_dir()):
        raise FileNotFoundError(
            f"Cannot initialise parsers as the TRUD processed directory {trud_processed_dir} does not exist, please check that TRUD has been installed: acmc trud install"
        )

    registry = {}
    for label, factory in (
        ("read2", Read2),
        ("read3", Read3),
        ("icd10", Icd10),
        ("snomed", Snomed),
        ("opcs4", Opcs4),
        ("atc", Atc),
        ("med", Med),
        ("cprd", Cprd),
    ):
        registry[label] = factory()
    # assigned only after every parser has been built
    self.code_types = registry