AbsaOSS · yruslan · May 28, 2026 · May 22, 2026 · May 26, 2026 · May 26, 2026
@@ -974,7 +974,7 @@ segment id. This way Cobrix will parse only relevant segment redefined fields an
   .option("redefine-segment-id-map:1", "REDEFINED_FIELD2 => SegmentId10,SegmentId11,...")
 ```
 
-For the above example the load options will lok like this (last 2 options):
+For the above example the load options will look like this (last 2 options):
 ```scala
 val df = spark
   .read
@@ -1011,6 +1011,79 @@ df.show(10)
 In the above example invalid fields became `null` and the parsing is done faster because Cobrix does not need to process
 every redefine for each record.
 
+## Automatic filtering of arbitrary redefines
+
+Arbitrary redefines can be resolved using rule expressions. These do not have to be segment redefines; the rules work for any redefines.
+For example, for a copybook that looks like this: 
+```cobol
+        01  COMPANY-DETAILS.
+************** RECORD-TYPE CAN BE 'C' for company, and 'P' or 'E' for person.        
+            05  RECORD-TYPE          PIC X(1).
+            05  COMPANY-ID           PIC X(10).
+            05  COMPANY.
+               10  COMPANY-NAME      PIC X(15).
+               10  ADDRESS           PIC X(25).
+            05  PERSON REDEFINES COMPANY.
+               10  FIRST-NAME        PIC X(30).
+               10  LAST-NAME         PIC X(30).
+```
+
+The syntax is as follows: 
+
+```scala
+  .option("redefine_rule:1", "COMPANY => RECORD_TYPE = 'C'")
+  .option("redefine_rule:2", "PERSON => in(RECORD_TYPE, 'P', 'E')")
+```
+
+For the above example the load options will look like this (last 2 options):
+```scala
+val df = spark
+  .read
+  .format("cobol")
+  .option("copybook_contents", copybook)
+  .option("record_format", "V")
+  .option("redefine_rule:1", "COMPANY => RECORD_TYPE = 'C'")
+  .option("redefine_rule:2", "PERSON => in(RECORD_TYPE, 'P', 'E')")
+  .load("examples/multisegment_data/COMP.DETAILS.SEP30.DATA.dat")
+```
+
+The filtered data will look like this:
+```
+df.show(10)
++-----------+----------+--------------------+--------------------+
+|RECORD_TYPE|COMPANY_ID|             COMPANY|              PERSON|
++-----------+----------+--------------------+--------------------+
+|          C|9377942526|[Joan Q & Z,10 Sa...|                    |
+|          P|9377942526|                    |       [John, Smith]|
+|          C|3483483977|[Robotrd Inc.,2 P...|                    |
+|          E|3483483977|                    |      [Jane, Wanson]|
+|          E|3483483977|                    |      [Alex,Johnson]|
++-----------+----------+--------------------+--------------------+
+```
+
+### Notes
+- Variable names in rule expressions are case-sensitive.
+- Variable names must be written in their sanitized form, i.e. after special characters are replaced with underscores.
+  Otherwise an expression such as `F-A = 1` would be ambiguous: it is not clear whether `F-A` is a variable name or a subtraction.
+  In this case the variable name should be `F_A` and the expression should be `F_A = 1`.
+- You can only reference variables that go _before_ the redefine field. This is because record decoding is forward only.
+- Use only field names themselves, not full paths, e.g. `COMPANY` instead of `RECORD.DETAILS.COMPANY`.
+- Only integral numeric literals are supported. Decimals are not supported.
+- The expression should return a boolean. For example: 
+  - `RECORD_TYPE = 'C'` is valid since it returns true for company records and false for person records.
+  - `in(RECORD_TYPE, 'P', 'E')` is valid since it returns true for person records and false for company records.
+  - `COMPANY_ID > 1000` is valid since it returns true for records with company id greater than 1000 and false otherwise.
+
+### Expressions supported
+- Comparison operators: `=`, `!=`, `>`, `<`, `>=`, `<=`.
+- Boolean logic: `&&` (and), `||` (or), `!` (not).
+- Integral literals: `123`, `0`, `-456` are valid, but `123.45` or `-123.45` are not valid.
+- String literals: `'abc'`, `'123'` are valid. Always use single quotes.
+- Boolean literals: `true`, `false`.
+- Null literal: `null`. For example: `RECORD_TYPE = 'C' || RECORD_TYPE = null` is valid.
+- Functions:
+  - `in()` (checks whether a value is one of the listed values), for example: `in(RECORD_TYPE, 'P', 'E')` is valid since it returns true for person records and false for company records.
+  - `if()` (conditional function with 3 arguments), for example: `if(RECORD_TYPE = 'C', true, false)` is valid.
 
 ## Group Filler dropping
 

@@ -25,6 +25,7 @@ import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPoint
 import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
 import za.co.absa.cobrix.cobol.parser.encoding.{EBCDIC, Encoding}
 import za.co.absa.cobrix.cobol.parser.exceptions.SyntaxErrorException
+import za.co.absa.cobrix.cobol.parser.expression.ExpressionEvaluator
 import za.co.absa.cobrix.cobol.parser.policies.DebugFieldsPolicy.DebugFieldsPolicy
 import za.co.absa.cobrix.cobol.parser.policies.StringTrimmingPolicy.StringTrimmingPolicy
 import za.co.absa.cobrix.cobol.parser.policies.{CommentPolicy, DebugFieldsPolicy, FillerNamingPolicy, StringTrimmingPolicy}
@@ -124,6 +125,7 @@ object CopybookParser extends Logging {
     * @param isUtf16BigEndian      If true UTF-16 strings are considered big-endian.
     * @param floatingPointFormat   A format of floating-point numbers (IBM/IEEE754).
     * @param nonTerminals          A list of non-terminals that should be extracted as strings.
+    * @param redefineRuleExpressions A map of REDEFINE field names to expressions that determine which redefine alternative to use when parsing records.
     * @param debugFieldsPolicy     Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
     * @return Seq[Group] where a group is a record inside the copybook.
     */
@@ -147,6 +149,7 @@ object CopybookParser extends Logging {
             floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
             nonTerminals: Seq[String] = Nil,
             occursHandlers: Map[String, Map[String, Int]] = Map(),
+            redefineRuleExpressions: Map[String, ExpressionEvaluator] = Map.empty,
             debugFieldsPolicy: DebugFieldsPolicy = DebugFieldsPolicy.NoDebug,
             fieldCodePageMap: Map[String, String] = Map.empty[String, String]): Copybook = {
     parseTree(dataEncoding,
@@ -169,6 +172,7 @@ object CopybookParser extends Logging {
       floatingPointFormat,
       nonTerminals,
       occursHandlers,
+      redefineRuleExpressions,
       debugFieldsPolicy,
       fieldCodePageMap)
   }
@@ -192,6 +196,7 @@ object CopybookParser extends Logging {
     * @param isUtf16BigEndian      If true UTF-16 strings are considered big-endian.
     * @param floatingPointFormat   A format of floating-point numbers (IBM/IEEE754)
     * @param nonTerminals          A list of non-terminals that should be extracted as strings
+    * @param redefineRuleExpressions A map of REDEFINE field names to expressions that determine which redefine alternative to use when parsing records.
     * @param debugFieldsPolicy     Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
     * @return Seq[Group] where a group is a record inside the copybook
     */
@@ -214,6 +219,7 @@ object CopybookParser extends Logging {
                 floatingPointFormat: FloatingPointFormat = FloatingPointFormat.IBM,
                 nonTerminals: Seq[String] = Nil,
                 occursHandlers: Map[String, Map[String, Int]] = Map(),
+                redefineRuleExpressions: Map[String, ExpressionEvaluator] = Map.empty,
                 debugFieldsPolicy: DebugFieldsPolicy = DebugFieldsPolicy.NoDebug,
                 fieldCodePageMap: Map[String, String] = Map.empty[String, String]): Copybook = {
     parseTree(EBCDIC,
@@ -236,6 +242,7 @@ object CopybookParser extends Logging {
       floatingPointFormat,
       nonTerminals,
       occursHandlers,
+      redefineRuleExpressions,
       debugFieldsPolicy,
       fieldCodePageMap)
   }
@@ -259,6 +266,7 @@ object CopybookParser extends Logging {
     * @param isUtf16BigEndian      If true UTF-16 strings are considered big-endian.
     * @param floatingPointFormat   A format of floating-point numbers (IBM/IEEE754)
     * @param nonTerminals          A list of non-terminals that should be extracted as strings
+    * @param redefineRuleExpressions A map of REDEFINE field names to expressions that determine which redefine alternative to use when parsing records.
     * @param debugFieldsPolicy     Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
     * @return Seq[Group] where a group is a record inside the copybook
     */
@@ -283,6 +291,7 @@ object CopybookParser extends Logging {
                 floatingPointFormat: FloatingPointFormat,
                 nonTerminals: Seq[String],
                 occursHandlers: Map[String, Map[String, Int]],
+                redefineRuleExpressions: Map[String, ExpressionEvaluator],
                 debugFieldsPolicy: DebugFieldsPolicy,
                 fieldCodePageMap: Map[String, String]): Copybook = {
 
@@ -313,7 +322,9 @@ object CopybookParser extends Logging {
       // Add debugging fields if debug mode is enabled.
       DebugFieldsAdder(debugFieldsPolicy),
       // For each group calculates the number of non-filler items.
-      NonFillerCountSetter()
+      NonFillerCountSetter(),
+      // Sets isUsedInRules and rule expressions for each field
+      RuleExpressionSetter(redefineRuleExpressions)
     )
 
     val transformedAst = transformers.foldLeft(schemaANTLR) {

@@ -858,7 +858,9 @@ class ParserVisitor(enc: Encoding,
       if (occurs.isDefined) occurs.get.dep else None,
       Map(),
       isDependee = false,
-      identifier.toUpperCase() == Constants.FILLER,
+      isUsedInRules = false,
+      isFiller = identifier.toUpperCase() == Constants.FILLER,
+      None,
       DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision),
       EncoderSelector.getEncoder(pic.value, effectiveEbcdicCodePage, effectiveAsciiCharset)
     )(Some(parent))

@@ -17,6 +17,7 @@
 package za.co.absa.cobrix.cobol.parser.ast
 
 import za.co.absa.cobrix.cobol.parser.ast.datatype.Usage
+import za.co.absa.cobrix.cobol.parser.expression.ExpressionEvaluator
 
 import scala.collection.mutable
 
@@ -57,6 +58,7 @@ case class Group(
                   isFiller: Boolean = false,
                   groupUsage: Option[Usage] = None,
                   nonFillerSize: Int = 0,
+                  ruleExpression: Option[ExpressionEvaluator] = None,
                   binaryProperties: BinaryProperties = BinaryProperties(0, 0, 0)
                 )
                 (val parent: Option[Group] = None)
@@ -81,7 +83,10 @@ case class Group(
   }
 
   /** Returns true if the field is a child segment */
-  def isChildSegment: Boolean = parentSegment.nonEmpty
+  override def isChildSegment: Boolean = parentSegment.nonEmpty
+
+  /** Returns true if the field is enabled for the input binary record. This implementation always returns true and does not evaluate the rule expression. */
+  override def enabledForRecord(record: Array[Byte]): Boolean = true
 
   /** Returns the original Group with updated children */
   def withUpdatedChildren(newChildren: mutable.ArrayBuffer[Statement]): Group = {
@@ -108,10 +113,16 @@ case class Group(
     copy(parentSegment = newParentSegmentOpt)(parent)
   }
 
+  /** Returns the original field with updated `dependingOnHandlers` */
   def withUpdatedDependingOnHandlers(newDependingOnHandlers: Map[String, Int]): Group = {
     copy(dependingOnHandlers = newDependingOnHandlers)(parent)
   }
 
+  /** Returns the original field with updated `ruleExpression` */
+  def withUpdatedRuleExpression(newRuleExpression: Option[ExpressionEvaluator]): Group = {
+    copy(ruleExpression = newRuleExpression)(parent)
+  }
+
 }
 
 object Group {

@@ -19,6 +19,7 @@ package za.co.absa.cobrix.cobol.parser.ast
 import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, COMP3, CobolType, Decimal, Integral}
 import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector}
 import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC, EncoderSelector}
+import za.co.absa.cobrix.cobol.parser.expression.ExpressionEvaluator
 
 /** An abstraction of the statements describing fields of primitive data types in the COBOL copybook
   *
@@ -33,6 +34,7 @@ import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC, EncoderSelector}
   * @param dependingOn         A field which specifies size of the array in a record
   * @param dependingOnHandlers A map of handlers for the dependingOn field
   * @param isDependee          A flag indicating if the field is a dependee
+  * @param isUsedInRules       If true, the variable is used in redefine rule expressions
   * @param isFiller            A flag indicating if the field is a filler
   * @param decode              A decoder for the field to convert from raw data to a JVM data type
   * @param encode              An optional encoder for the field to convert from a JVM data type to raw data
@@ -52,7 +54,9 @@ case class Primitive(
                       dependingOn: Option[String] = None,
                       dependingOnHandlers: Map[String, Int] = Map(),
                       isDependee: Boolean = false,
+                      isUsedInRules: Boolean = false,
                       isFiller: Boolean = false,
+                      ruleExpression: Option[ExpressionEvaluator] = None,
                       decode: DecoderSelector.Decoder,
                       encode: Option[EncoderSelector.Encoder],
                       binaryProperties: BinaryProperties = BinaryProperties(0, 0, 0)
@@ -107,7 +111,10 @@ case class Primitive(
   }
 
   /** Returns true if the field is a child segment */
-  def isChildSegment: Boolean = false
+  override def isChildSegment: Boolean = false
+
+  /** Returns true if the field is enabled for the input binary record. This implementation always returns true and does not evaluate the rule expression. */
+  override def enabledForRecord(record: Array[Byte]): Boolean = true
 
   /** Returns the original field with updated binary properties */
   def withUpdatedBinaryProperties(newBinaryProperties: BinaryProperties): Primitive = {
@@ -124,10 +131,21 @@ case class Primitive(
     copy(isDependee = newIsDependee)(parent)
   }
 
+  /** Returns the original field with updated `isUsedInRules` flag */
+  def withUpdatedIsUsedInRules(newIsUsedInRules: Boolean): Primitive = {
+    copy(isUsedInRules = newIsUsedInRules)(parent)
+  }
+
+  /** Returns the original field with updated `dependingOnHandlers` */
   def withUpdatedDependingOnHandlers(newDependingOnHandlers: Map[String, Int]): Primitive = {
     copy(dependingOnHandlers = newDependingOnHandlers)(parent)
   }
 
+  /** Returns the original field with updated `ruleExpression` */
+  def withUpdatedRuleExpression(newRuleExpression: Option[ExpressionEvaluator]): Primitive = {
+    copy(ruleExpression = newRuleExpression)(parent)
+  }
+
   /** Returns the binary size in bytes for the field */
   def getBinarySizeBytes: Int = {
     dataType match {

@@ -16,6 +16,8 @@
 
 package za.co.absa.cobrix.cobol.parser.ast
 
+import za.co.absa.cobrix.cobol.parser.expression.ExpressionEvaluator
+
 /** Trait for Cobol copybook AST element (a statement). */
 trait Statement {
   /** Returns the level of the AST element */
@@ -82,6 +84,12 @@ trait Statement {
   /** Returns true if the field is a child segment */
   def isChildSegment: Boolean
 
+  /** The expression for the field enablement. Usually used for redefined fields. */
+  def ruleExpression: Option[ExpressionEvaluator]
+
+  /** Returns true if the field is enabled for the input binary record. Uses the rule expression to determine that. */
+  def enabledForRecord(record: Array[Byte]): Boolean
+
   /** Binary properties of a field */
   val binaryProperties: BinaryProperties