Skip to content

Commit 3477495

Browse files
Update rules to forbid ambiguous boolean operator precedence
And Or precedence is well defined in general. And is a higher precedence than or and should be evaluated first, but it causes enough mistakes that I have decided to raise a parsing error when things like A | B & C happen. Now the user must include some grouping with [] to specify the precedence Also with the grammer change, it was easiest to make or and and chain together, like PipeChain instead of nesting. This should ideally also be more efficient.
1 parent b3c5bde commit 3477495

5 files changed

Lines changed: 175 additions & 132 deletions

File tree

src/rules.lark

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,32 @@
33

44
step_expr: expr ("," expr)+ -> step_
55

6+
# expr can be a pure-or chain, a pure-and chain, or a single pipe_expr
67
?expr: or_expr
8+
| and_expr
9+
| pipe_expr
710

8-
?or_expr: and_expr
9-
| or_expr "|" and_expr -> or_
11+
# OR is a chain of pipe_expr operands (NOT and_expr operands)
12+
or_expr: pipe_expr ("|" pipe_expr)+ -> or_
1013

11-
?and_expr: pipe_expr
12-
| and_expr "&" pipe_expr -> and_
14+
# AND is a chain of pipe_expr operands (NOT or_expr operands)
15+
and_expr: pipe_expr ("&" pipe_expr)+ -> and_
1316

1417
?pipe_expr: atom
1518
| pipe_expr "->" call -> pipe_
1619

1720
?atom: call
1821
| literal
1922
| "[" expr "]" -> group
23+
| "[" step_expr "]" -> group
2024

21-
call: IDENT "(" [expr ("," expr)*] ")" -> call
25+
call: IDENT "(" [arg ("," arg)*] ")" -> call
26+
27+
// list literal as a VALUE (requires at least one comma)
28+
list_lit: "[" expr ("," expr)+ "]" -> step_
29+
30+
?arg: list_lit
31+
| expr
2232

2333
?literal: name_ref
2434
| NUMBER -> number

src/rules.py

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
import numpy as np
1111
import polars as pl
12-
from lark import Lark, Transformer
12+
from lark import Lark, Transformer, LarkError
1313

1414
OP_TO_EXPR = {
1515
"gt": operator.gt,
@@ -89,6 +89,12 @@ class Expr:
8989
def validate(self):
9090
pass
9191

92+
def pprint(self):
93+
"""Pretty print the expression for debugging."""
94+
from pprint import pprint
95+
96+
pprint(self)
97+
9298

9399
@dataclass(frozen=True)
94100
class Name(Expr):
@@ -108,14 +114,12 @@ class String(Expr):
108114

109115
@dataclass(frozen=True)
110116
class And(Expr):
111-
a: Expr
112-
b: Expr
117+
parts: Tuple[Expr, ...]
113118

114119

115120
@dataclass(frozen=True)
116121
class Or(Expr):
117-
a: Expr
118-
b: Expr
122+
parts: Tuple[Expr, ...]
119123

120124

121125
@dataclass(frozen=True)
@@ -247,13 +251,14 @@ def string(self, items):
247251
return String(s)
248252

249253
def and_(self, items):
250-
return And(items[0], items[1])
254+
return And(parts=tuple(items))
251255

252256
def or_(self, items):
253-
return Or(items[0], items[1])
257+
return Or(parts=tuple(items))
254258

255259
def group(self, items):
256260
# square brackets are just grouping, not a node
261+
assert len(items) == 1, "Grouping isn't supposed to change number of items"
257262
return items[0]
258263

259264
def step_(self, items):
@@ -297,7 +302,6 @@ def from_rules(cls, *args, **kwargs) -> CompiledRules:
297302
k: expand_macros(v, defs_expanded, needed_features=needed_features)
298303
for k, v in rules.items()
299304
}
300-
301305
return cls(rules=rules_expanded, needed_features=needed_features)
302306

303307

@@ -338,11 +342,32 @@ def load_rules(
338342
with open(Path(__file__).parent.absolute() / "rules.lark") as f:
339343
parser = Lark(f, parser="lalr", transformer=ASTTransformer())
340344

345+
def parse_rule_expr(expr_str: str) -> Expr:
346+
"""We use a closure to capture the parser instance since
347+
polars map_elements doesn't let us pass extra args."""
348+
try:
349+
return parser.parse(expr_str)
350+
except LarkError as e:
351+
n_left_b = expr_str.count("[")
352+
n_right_b = expr_str.count("]")
353+
if n_left_b != n_right_b:
354+
raise RuleError(
355+
f"Possible mismatched brackets in rule expression: {expr_str}"
356+
) from e
357+
if expr_str.count("&") > 0 and expr_str.count("|") > 0:
358+
# both used; likely missing brackets
359+
raise RuleError(
360+
f"Possible ambiguous use of '&' or '|' without surrounding brackets `[ ]`. Check that you don't have a situation like `A | B & C` or `A & B | C`. These are generally not allowed because they can be ambiguous. These should be written as `[A | B] & C` or `[A & B] | C`: {expr_str}."
361+
) from e
362+
raise RuleError(
363+
f"Error parsing rule expression for unknown reason. Possible hints could be found in the above exceptions from the parsing library Lark: {expr_str}"
364+
) from e
365+
341366
lf = lf.with_columns(
342367
[
343368
pl.col(rules_col)
344369
.str.strip_chars()
345-
.map_elements(parser.parse, return_dtype=pl.Object)
370+
.map_elements(parse_rule_expr, return_dtype=pl.Object)
346371
]
347372
)
348373

@@ -400,9 +425,9 @@ def recurse(e: Expr, add_name_to_needed: bool = True) -> Expr:
400425
elif isinstance(e, (Number, String)):
401426
out = e
402427
elif isinstance(e, And):
403-
out = And(recurse(e.a), recurse(e.b))
428+
out = And(parts=tuple(recurse(part) for part in e.parts))
404429
elif isinstance(e, Or):
405-
out = Or(recurse(e.a), recurse(e.b))
430+
out = Or(parts=tuple(recurse(part) for part in e.parts))
406431
elif isinstance(e, Steps):
407432
out = Steps(tuple(recurse(p) for p in e.parts))
408433
elif isinstance(e, Call):
@@ -529,13 +554,18 @@ def eval_bool(self, expr: Expr) -> np.ndarray:
529554
return out
530555

531556
if isinstance(expr, And):
532-
out = np.logical_and(self.eval_bool(expr.a), self.eval_bool(expr.b))
557+
out = np.all(
558+
np.stack([self.eval_bool(part) for part in expr.parts], axis=0), axis=0
559+
)
533560
self._memo[expr] = out
534561
return out
535562

536563
if isinstance(expr, Or):
537564
try:
538-
out = np.logical_or(self.eval_bool(expr.a), self.eval_bool(expr.b))
565+
out = np.any(
566+
np.stack([self.eval_bool(part) for part in expr.parts], axis=0),
567+
axis=0,
568+
)
539569
except ValueError as e:
540570
print(f"Error evaluating OR expression: {expr}")
541571
raise
@@ -565,10 +595,10 @@ def eval_bool(self, expr: Expr) -> np.ndarray:
565595

566596
if isinstance(expr, Steps):
567597
raise RuleError(
568-
f"Step expressions (expression seperated by commas)"
598+
"Step expressions (expression seperated by commas)"
569599
" logic cannot be evaluted on their own. They must be used"
570600
" in a supporting function such as `percent`. Offending"
571-
" rule: {expr}"
601+
f" rule: {expr}"
572602
)
573603

574604
raise RuleError(

tests/data/basic_anno.tsv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
query_id input_fasta kegg_id kegg_description sulfur_id fegenie_id kofam_id merops_id methyl_id dbcan_id camper_id heme_regulatory_motif_count
2-
bin1_1 bin1 K00399 mez:Mtc_0908 (K00399) mcrA; methyl-coenzyme M reductase alpha subunit [EC:2.8.4.1] K00399 0
2+
bin1_1 bin1 K00399 mez:Mtc_0908 (K00399) mcrA; methyl-coenzyme M reductase alpha subunit [EC:2.8.4.1] K00399 5
33
bin1_2 bin1 K14126 "mez:Mtc_2479 (K14126) mvhA, vhuA, vhcA; F420-non-reducing hydrogenase large subunit [EC:1.12.99.- 1.8.98.5]" K14126 4
4-
bin1_3 bin1 K14128 "mez:Mtc_2478 (K14128) mvhG, vhuG, vhcG; F420-non-reducing hydrogenase small subunit [EC:1.12.99.- 1.8.98.5]" K14128 2
4+
bin1_3 bin1 K14128 "mez:Mtc_2478 (K14128) mvhG, vhuG, vhcG; F420-non-reducing hydrogenase small subunit [EC:1.12.99.- 1.8.98.5]" K14128 4
55
bin1_4 bin1 K00205 "rci:RCIX592 (K00205) fwdF, fmdF; 4Fe-4S ferredoxin" K00205 0
66
bin2_1 bin2 K03390 mla:Mlab_0239 (K03390) hdrC2; heterodisulfide reductase subunit C2 [EC:1.8.7.3 1.8.98.4 1.8.98.5 1.8.98.6] K03390 MER0283993 0
77
bin2_2 bin2 K03389 mez:Mtc_2474 (K03389) hdrB2; heterodisulfide reductase subunit B2 [EC:1.8.7.3 1.8.98.4 1.8.98.5 1.8.98.6] K03389 0

tests/data/compound_rules.tsv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ B b1 b2 | b4
66
b2 percent(50,b3)
77
b3 K00399,[K00401 & K00402 & [K00333 | K03421] & [K03422 | K13380]],K00400,K00401,K00402,K03421,K03422
88
b4 K02591 | K02586 | K02588
9-
C c1 c1 & c2
10-
c1 not(c3)
11-
c2 column_count_values(heme_regulatory_motif_count,ge,4,ge,3)
12-
c3 column_contains(kegg_description,NitrateReducer)
9+
C c1 c2
10+
c2 c3 -> column_count_values(heme_regulatory_motif_count,ge,2,ge,2)
11+
c3 not(c4)
12+
c4 filter_contains(kegg_description,"nitrate reductase")
1313
D d1 percent(50,d2)
1414
d2 K00399,K14126,K02588

0 commit comments

Comments
 (0)