76 changes: 41 additions & 35 deletions pairtools/cli/dedup.py
@@ -383,7 +383,6 @@ def dedup_py(
n_proc,
**kwargs,
):

sep = ast.literal_eval('"""' + sep + '"""')
send_header_to_dedup = send_header_to in ["both", "dedup"]
send_header_to_dup = send_header_to in ["both", "dups"]
@@ -485,6 +484,34 @@ def dedup_py(
logger.warning(
"Pairs file appears not to be sorted, dedup might produce wrong results."
)

# Canonicalize column names for flexible matching
column_names = headerops.extract_column_names(header)
column_names = headerops.canonicalize_columns(column_names)

# Get column indices with fallbacks
try:
col1 = headerops.get_column_index(column_names, c1)
Review comment (Member): the names like "col1/col2/cols1" are not descriptive enough, so maybe use "col_c1/col_c2/col_s1" instead: (1) use underscore, (2) preserve the recognizable name that was used before (c1/c2/p1/p2/etc.).

col2 = headerops.get_column_index(column_names, c2)
colp1 = headerops.get_column_index(column_names, p1)
colp2 = headerops.get_column_index(column_names, p2)
cols1 = headerops.get_column_index(column_names, s1)
cols2 = headerops.get_column_index(column_names, s2)

# Handle extra column pairs
extra_cols1 = []
extra_cols2 = []
if extra_col_pair is not None:
for col1_spec, col2_spec in extra_col_pair:
try:
extra_cols1.append(headerops.get_column_index(column_names, col1_spec))
extra_cols2.append(headerops.get_column_index(column_names, col2_spec))
except ValueError:
logger.warning(f"Extra column pair ({col1_spec}, {col2_spec}) not found in header, skipping")
continue
except ValueError as e:
raise ValueError(f"Column error: {str(e)}") from e

header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
dups_header = header.copy()
if keep_parent_id and len(dups_header) > 0:
@@ -502,38 +529,17 @@ def dedup_py(
):
outstream_unmapped.writelines((l + "\n" for l in header))

column_names = headerops.extract_column_names(header)
extra_cols1 = []
extra_cols2 = []
if extra_col_pair is not None:
for col1, col2 in extra_col_pair:
extra_cols1.append(column_names[col1] if col1.isnumeric() else col1)
extra_cols2.append(column_names[col2] if col2.isnumeric() else col2)

if backend == "cython":
# warnings.warn(
# "'cython' backend is deprecated and provided only"
# " for backwards compatibility",
# DeprecationWarning,
# )
extra_cols1 = [column_names.index(col) for col in extra_cols1]
extra_cols2 = [column_names.index(col) for col in extra_cols2]
c1 = column_names.index(c1)
c2 = column_names.index(c2)
p1 = column_names.index(p1)
p2 = column_names.index(p2)
s1 = column_names.index(s1)
s2 = column_names.index(s2)
streaming_dedup_cython(
method,
max_mismatch,
sep,
c1,
c2,
p1,
p2,
s1,
s2,
col1,
col2,
colp1,
colp2,
cols1,
cols2,
extra_cols1,
extra_cols2,
unmapped_chrom,
@@ -554,7 +560,7 @@
method=method,
mark_dups=mark_dups,
max_mismatch=max_mismatch,
extra_col_pairs=list(extra_col_pair),
extra_col_pairs=list(zip(extra_cols1, extra_cols2)) if extra_cols1 else [],
keep_parent_id=keep_parent_id,
unmapped_chrom=unmapped_chrom,
outstream=outstream,
@@ -563,12 +569,12 @@
out_stat=out_stat,
backend=backend,
n_proc=n_proc,
c1=c1,
c2=c2,
p1=p1,
p2=p2,
s1=s1,
s2=s2,
c1=col1,
c2=col2,
p1=colp1,
p2=colp2,
s1=cols1,
s2=cols2,
)
else:
raise ValueError("Unknown backend")
49 changes: 37 additions & 12 deletions pairtools/cli/sort.py
@@ -6,10 +6,12 @@
import subprocess
import shutil
import warnings
from .._logging import get_logger

from ..lib import fileio, pairsam_format, headerops
from . import cli, common_io_options

logger = get_logger()
UTIL_NAME = "pairtools_sort"


@@ -58,6 +60,7 @@
default=pairsam_format.COLUMNS_PAIRS[7],
help=f"Pair type column; default {pairsam_format.COLUMNS_PAIRS[7]}"
"[input format option]",
required=False,
)
@click.option(
"--extra-col",
@@ -157,7 +160,6 @@ def sort_py(
compress_program,
**kwargs,
):

instream = fileio.auto_open(
pairs_path,
mode="r",
@@ -176,7 +178,6 @@
header = headerops.mark_header_as_sorted(header)

outstream.writelines((l + "\n" for l in header))

outstream.flush()

if compress_program == "auto":
@@ -191,16 +192,42 @@
compress_program = "gzip"

column_names = headerops.extract_column_names(header)
columns = [c1, c2, p1, p2, pt] + list(extra_col)
# Now generating the "-k <i>,<i><mode>" expressions for all columns.
# If column name is in the default pairsam format and has an integer dtype there, do numerical sorting
column_names = headerops.canonicalize_columns(column_names)

# Get column indices with fallbacks
try:
col1 = headerops.get_column_index(column_names, c1)
col2 = headerops.get_column_index(column_names, c2)
colp1 = headerops.get_column_index(column_names, p1)
colp2 = headerops.get_column_index(column_names, p2)

# Make pair_type optional
try:
colpt = headerops.get_column_index(column_names, pt) if pt else None
except ValueError:
colpt = None

extra_cols = []
for col in extra_col:
try:
extra_cols.append(headerops.get_column_index(column_names, col))
except ValueError:
logger.warning(f"Extra column {col} not found in header, skipping")
continue
except ValueError as e:
raise ValueError(f"Column error: {str(e)}") from e

# Generate sort command columns
cols = []
for col in columns:
colindex = int(col) if col.isnumeric() else column_names.index(col) + 1
for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols):
if col is None:
continue # Skip optional columns that weren't found
dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str)
cols.append(
f"-k {colindex},{colindex}{'n' if issubclass(pairsam_format.DTYPES_PAIRSAM.get(column_names[colindex-1], str), int) else ''}"
f"-k {col+1},{col+1}{'n' if issubclass(dtype, int) else ''}"
)
cols = " ".join(cols)

command = rf"""
/bin/bash -c 'export LC_COLLATE=C; export LANG=C; sort
{cols}
@@ -210,9 +237,8 @@
{f'--temporary-directory={tmpdir}' if tmpdir else ''}
-S {memory}
{f'--compress-program={compress_program}' if compress_program else ''}'
""".replace(
"\n", " "
)
""".replace("\n", " ")

with subprocess.Popen(
command, stdin=subprocess.PIPE, bufsize=-1, shell=True, stdout=outstream
) as process:
@@ -224,7 +250,6 @@

if instream != sys.stdin:
instream.close()

if outstream != sys.stdout:
outstream.close()
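
As a sanity check on the key-generation loop in this diff, here is a small standalone sketch of the sort(1) key expressions it would emit for the default .pairs columns; treating pos1/pos2 as the only integer dtypes among the sorted columns is an assumption about DTYPES_PAIRSAM, and this snippet is illustrative only:

# Expected "-k" expressions for the default .pairs column order
column_names = ["readID", "chrom1", "pos1", "chrom2", "pos2",
                "strand1", "strand2", "pair_type"]
int_cols = {"pos1", "pos2"}  # assumed integer dtypes
keys = []
for col in [1, 2, 3, 4, 7]:  # chrom1, pos1, chrom2, pos2, pair_type
    suffix = "n" if column_names[col] in int_cols else ""
    keys.append(f"-k {col + 1},{col + 1}{suffix}")
print(" ".join(keys))  # -> "-k 2,2 -k 3,3n -k 4,4 -k 5,5n -k 8,8"
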

45 changes: 45 additions & 0 deletions pairtools/lib/headerops.py
@@ -22,6 +22,51 @@



def canonicalize_columns(columns):
Review comment (agalitsyna, Member, Jun 30, 2025): "canonicalize" -> "standardize"
Review comment (Member): Make it a function with a single argument.

"""Convert between common column name variants."""
canonical_map = {
Review comment (Member): Remove identities (when key equals the value)

'chr1': 'chrom1',
'chr2': 'chrom2',
'chrom1': 'chrom1', # Ensure identity mapping
'chrom2': 'chrom2',
'pt': 'pair_type',
'pair_type': 'pair_type'
}
return [canonical_map.get(col.lower(), col) for col in columns]
Review comment (Member): suggested change:
canonical_map.get(col.lower(), col.lower())
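
Taken together, the suggestions above would reshape the helper roughly as follows; this is a sketch only, not part of the diff, and note that the lowercase fallback also lowercases unknown names:

# Sketch: "standardize" naming, identity entries dropped, lowercase fallback
def standardize_columns(columns):
    """Map common column-name aliases onto the standard .pairs names."""
    alias_map = {"chr1": "chrom1", "chr2": "chrom2", "pt": "pair_type"}
    return [alias_map.get(col.lower(), col.lower()) for col in columns]

assert standardize_columns(["Chr1", "POS1", "pt"]) == ["chrom1", "pos1", "pair_type"]
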


def get_column_index(column_names, column_spec):
Review comment (Member): "column_spec" -> "column/col"

"""Get column index with flexible name matching."""
if isinstance(column_spec, int):
if -len(column_names) <= column_spec < len(column_names):
return column_spec % len(column_names) # Handle negative indices
raise ValueError(f"Column index {column_spec} out of range")

if not isinstance(column_spec, (str, int)):
raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}")

# Try direct match first
try:
return column_names.index(column_spec)
except ValueError:
pass

# Try canonical name (canonicalize both the query and the header columns,
# so that e.g. "chrom1" also matches a header that uses "chr1")
canonical = canonicalize_columns([column_spec])[0]
canonical_columns = canonicalize_columns(column_names)
try:
return canonical_columns.index(canonical)
except ValueError:
pass

# Try case-insensitive
Review comment (agalitsyna, Member, Jun 30, 2025): Remove the case-insensitive matching or explain why it's needed (we have not found the use case; Open2C mtg 30 June 2025).

lower_columns = [c.lower() for c in column_names]
try:
return lower_columns.index(canonical.lower())
except ValueError:
available = ', '.join(f"'{c}'" for c in column_names)
raise ValueError(
f"Column '{column_spec}' not found. Available columns: {available}"
)
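
A quick illustration of the lookup order this function implements (direct match, then canonical alias, then case-insensitive); the calls below are illustrative only, not part of the diff:

# cols = ["readID", "chrom1", "pos1"]
# get_column_index(cols, "chrom1")  # direct match                 -> 1
# get_column_index(cols, "chr1")    # resolved via the alias map   -> 1
# get_column_index(cols, "Chrom1")  # lowercased, then alias map   -> 1
# get_column_index(cols, "chrX")    # ValueError listing the available columns
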

def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
"""Returns a header from the stream and an the reaminder of the stream
with the actual data.
98 changes: 98 additions & 0 deletions tests/test_headerops.py
@@ -4,6 +4,104 @@
import pytest


from pairtools.lib.headerops import (
canonicalize_columns,
get_column_index,
extract_column_names,
)

def test_canonicalize_columns():
# Test basic canonicalization
assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2']
assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2']
assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other']

# Test mixed case
assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2']
assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2']

# Test no changes needed
assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1']

# Test empty input
assert canonicalize_columns([]) == []

# Test all known aliases
assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type']

def test_get_column_index():
# Setup test columns
columns = ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']

# Test string lookup - direct matches
assert get_column_index(columns, 'chr1') == 1
assert get_column_index(columns, 'pos2') == 4
assert get_column_index(columns, 'pair_type') == 7

# Test string lookup - canonicalized matches
assert get_column_index(columns, 'chrom1') == 1
assert get_column_index(columns, 'CHROM2') == 3
assert get_column_index(columns, 'PT') == 7

# Test case insensitive matches
assert get_column_index(columns, 'CHR1') == 1
assert get_column_index(columns, 'ChR2') == 3

# Test integer lookup
assert get_column_index(columns, 0) == 0
assert get_column_index(columns, 3) == 3
assert get_column_index(columns, 7) == 7

# Test error cases
with pytest.raises(ValueError, match="Column 'nonexistent' not found"):
get_column_index(columns, 'nonexistent')

with pytest.raises(ValueError, match="Column index 100 out of range"):
get_column_index(columns, 100)

with pytest.raises(AttributeError, match="Column spec must be string or integer"):
get_column_index(columns, 3.14)

def test_integration_with_extract_column_names():
# Test with actual header format
header = [
"## pairs format v1.0",
"#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 pair_type",
"#chromsize: chr1 1000",
"#chromsize: chr2 800"
]

columns = extract_column_names(header)
assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']

# Test canonicalized column access
assert get_column_index(columns, 'chrom1') == 1
assert get_column_index(columns, 'chrom2') == 3
assert get_column_index(columns, 'pt') == 7

# Test with alternative header format
header2 = [
"## pairs format v1.0",
"#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type",
]
columns2 = extract_column_names(header2)
assert get_column_index(columns2, 'chr1') == 1
assert get_column_index(columns2, 'chr2') == 3

def test_edge_cases():
# Test empty columns
with pytest.raises(ValueError):
get_column_index([], 'chrom1')

# Test invalid column spec type
with pytest.raises(AttributeError):
get_column_index(['a', 'b', 'c'], 3.14) # float not supported

# Test negative indices
assert get_column_index(['a', 'b', 'c'], -1) == 2 # Python-style negative indexing


def test_make_standard_header():
header = headerops.make_standard_pairsheader()

Expand Down