Cleaner Module

The Cleaner module provides operations for cleaning dirty data, including HTML removal, whitespace normalization, Unicode fixing, and JSON compression.

StripHTML

Remove HTML tags from text, with options to preserve semantic tags or convert to Markdown.

prompt_refiner.cleaner.StripHTML

StripHTML(preserve_tags=None, to_markdown=False)

Bases: Refiner

Remove HTML tags from text, with options to preserve semantic tags or convert to Markdown.

Initialize the HTML stripper.

Parameters:

Name           Type                Description                                                 Default
preserve_tags  Optional[Set[str]]  Set of tag names to preserve (e.g., {'p', 'li', 'table'})   None
to_markdown    bool                Convert common HTML tags to Markdown syntax                 False
Source code in src/prompt_refiner/cleaner/html.py
def __init__(
    self,
    preserve_tags: Optional[Set[str]] = None,
    to_markdown: bool = False,
):
    """
    Initialize the HTML stripper.

    Args:
        preserve_tags: Set of tag names to preserve (e.g., {'p', 'li', 'table'})
        to_markdown: Convert common HTML tags to Markdown syntax
    """
    self.preserve_tags = preserve_tags or set()
    self.to_markdown = to_markdown

Functions

process
process(text)

Remove HTML tags from the input text.

Parameters:

Name  Type  Description                     Default
text  str   The input text containing HTML  required

Returns:

Type  Description
str   Text with HTML tags removed or converted to Markdown

Source code in src/prompt_refiner/cleaner/html.py
def process(self, text: str) -> str:
    """
    Remove HTML tags from the input text.

    Args:
        text: The input text containing HTML

    Returns:
        Text with HTML tags removed or converted to Markdown
    """
    result = text

    if self.to_markdown:
        # Convert common HTML tags to Markdown
        # Bold
        result = re.sub(r"<strong>(.*?)</strong>", r"**\1**", result, flags=re.DOTALL)
        result = re.sub(r"<b>(.*?)</b>", r"**\1**", result, flags=re.DOTALL)
        # Italic
        result = re.sub(r"<em>(.*?)</em>", r"*\1*", result, flags=re.DOTALL)
        result = re.sub(r"<i>(.*?)</i>", r"*\1*", result, flags=re.DOTALL)
        # Links
        result = re.sub(
            r'<a[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            r"[\2](\1)",
            result,
            flags=re.DOTALL,
        )
        # Headers
        for i in range(1, 7):
            result = re.sub(
                f"<h{i}[^>]*>(.*?)</h{i}>",
                f"{'#' * i} \\1\n",
                result,
                flags=re.DOTALL,
            )
        # Code
        result = re.sub(r"<code>(.*?)</code>", r"`\1`", result, flags=re.DOTALL)
        # Lists - simple conversion
        result = re.sub(r"<li[^>]*>(.*?)</li>", r"- \1\n", result, flags=re.DOTALL)
        # Paragraphs
        result = re.sub(r"<p[^>]*>(.*?)</p>", r"\1\n\n", result, flags=re.DOTALL)
        # Line breaks
        result = re.sub(r"<br\s*/?>", "\n", result)

    if self.preserve_tags:
        # Remove all tags except preserved ones
        # This is a simplified implementation
        tags_pattern = r"</?(?!" + "|".join(self.preserve_tags) + r"\b)[^>]+>"
        result = re.sub(tags_pattern, "", result)
    else:
        # Remove all HTML tags
        result = re.sub(r"<[^>]+>", "", result)

    # Clean up excessive newlines
    result = re.sub(r"\n{3,}", "\n\n", result)

    return result.strip()

Examples

from prompt_refiner import StripHTML

# Basic HTML stripping
stripper = StripHTML()
result = stripper.process("<p>Hello <b>World</b>!</p>")
# Output: "Hello World!"

# Convert to Markdown
stripper = StripHTML(to_markdown=True)
result = stripper.process("<p>Hello <b>World</b>!</p>")
# Output: "Hello **World**!\n\n"

# Preserve specific tags
stripper = StripHTML(preserve_tags={"p", "div"})
result = stripper.process("<div>Keep <b>Remove</b></div>")
# Output: "<div>Keep Remove</div>"

NormalizeWhitespace

Collapse excessive whitespace, tabs, and newlines into single spaces.

prompt_refiner.cleaner.NormalizeWhitespace

Bases: Refiner

Normalize whitespace in text.

Functions

process
process(text)

Normalize whitespace by collapsing multiple spaces into one.

Parameters:

Name  Type  Description     Default
text  str   The input text  required

Returns:

Type  Description
str   Text with normalized whitespace

Source code in src/prompt_refiner/cleaner/whitespace.py
def process(self, text: str) -> str:
    """
    Normalize whitespace by collapsing multiple spaces into one.

    Args:
        text: The input text

    Returns:
        Text with normalized whitespace
    """
    # Replace multiple whitespace with single space and strip edges
    return " ".join(text.split())

Examples

from prompt_refiner import NormalizeWhitespace

normalizer = NormalizeWhitespace()
result = normalizer.process("Hello    World  \t\n  Foo")
# Output: "Hello World Foo"

FixUnicode

Remove problematic Unicode characters including zero-width spaces and control characters.

prompt_refiner.cleaner.FixUnicode

FixUnicode(
    remove_zero_width=True, remove_control_chars=True
)

Bases: Refiner

Remove or fix problematic Unicode characters.

Initialize the Unicode fixer.

Parameters:

Name                  Type  Description                                            Default
remove_zero_width     bool  Remove zero-width spaces and similar characters        True
remove_control_chars  bool  Remove control characters (except newlines and tabs)   True
Source code in src/prompt_refiner/cleaner/unicode.py
def __init__(self, remove_zero_width: bool = True, remove_control_chars: bool = True):
    """
    Initialize the Unicode fixer.

    Args:
        remove_zero_width: Remove zero-width spaces and similar characters
        remove_control_chars: Remove control characters (except newlines and tabs)
    """
    self.remove_zero_width = remove_zero_width
    self.remove_control_chars = remove_control_chars

Functions

process
process(text)

Clean problematic Unicode characters from text.

Parameters:

Name  Type  Description     Default
text  str   The input text  required

Returns:

Type  Description
str   Text with problematic Unicode characters removed

Source code in src/prompt_refiner/cleaner/unicode.py
def process(self, text: str) -> str:
    """
    Clean problematic Unicode characters from text.

    Args:
        text: The input text

    Returns:
        Text with problematic Unicode characters removed
    """
    result = text

    if self.remove_zero_width:
        # Remove zero-width characters
        zero_width_chars = [
            "\u200b",  # Zero-width space
            "\u200c",  # Zero-width non-joiner
            "\u200d",  # Zero-width joiner
            "\ufeff",  # Zero-width no-break space (BOM)
            "\u2060",  # Word joiner
        ]
        for char in zero_width_chars:
            result = result.replace(char, "")

    if self.remove_control_chars:
        # Remove control characters except newlines, tabs, and carriage returns
        # Keep: \n (0x0A), \t (0x09), \r (0x0D)
        result = "".join(
            char
            for char in result
            if not unicodedata.category(char).startswith("C") or char in ("\n", "\t", "\r")
        )

    # Normalize Unicode to NFC form (canonical composition)
    result = unicodedata.normalize("NFC", result)

    return result

Examples

from prompt_refiner import FixUnicode

# Remove zero-width spaces and control chars
fixer = FixUnicode()
result = fixer.process("Hello\u200bWorld\u0000")
# Output: "HelloWorld"

# Only remove zero-width spaces
fixer = FixUnicode(remove_control_chars=False)
result = fixer.process("Hello\u200bWorld")
# Output: "HelloWorld"

JsonCleaner

Clean and minify JSON by removing null values and empty containers.

prompt_refiner.cleaner.JsonCleaner

JsonCleaner(strip_nulls=True, strip_empty=True)

Bases: Refiner

Cleans and minifies JSON strings. Removes null values, empty containers, and extra whitespace.

Parameters:

Name         Type  Description                                                                Default
strip_nulls  bool  If True, remove null/None values from objects and arrays (default: True)  True
strip_empty  bool  If True, remove empty dicts, lists, and strings (default: True)           True
Example

>>> from prompt_refiner import JsonCleaner
>>> cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
>>> dirty_json = '''
... {
...     "name": "Alice",
...     "age": null,
...     "address": {},
...     "tags": [],
...     "bio": ""
... }
... '''
>>> result = cleaner.run(dirty_json)
>>> print(result)

Use Cases
  • RAG Context Compression: Strip nulls/empties from API responses before feeding to LLM
  • Cost Optimization: Reduce token count by removing unnecessary JSON structure
  • Data Cleaning: Normalize JSON from multiple sources with inconsistent null handling

Initialize JSON cleaner.

Parameters:

Name         Type  Description                                Default
strip_nulls  bool  Remove null/None values                    True
strip_empty  bool  Remove empty containers (dict, list, str)  True
Source code in src/prompt_refiner/cleaner/json.py
def __init__(self, strip_nulls: bool = True, strip_empty: bool = True):
    """
    Initialize JSON cleaner.

    Args:
        strip_nulls: Remove null/None values
        strip_empty: Remove empty containers (dict, list, str)
    """
    self.strip_nulls = strip_nulls
    self.strip_empty = strip_empty

Functions

process
process(text)

Process the input JSON (string or object). Returns a minified JSON string.

Parameters:

Name  Type                     Description                          Default
text  Union[str, Dict, List]   JSON string, dict, or list to clean  required

Returns:

Type  Description
str   Minified JSON string with nulls/empties removed

Note

If input is not valid JSON, returns input unchanged.

Source code in src/prompt_refiner/cleaner/json.py
def process(self, text: Union[str, Dict, List]) -> str:
    """
    Process the input JSON (string or object).
    Returns a minified JSON string.

    Args:
        text: JSON string, dict, or list to clean

    Returns:
        Minified JSON string with nulls/empties removed

    Note:
        If input is not valid JSON, returns input unchanged.
    """
    # 1. Parse Input (Handle both string JSON and raw Dict/List)
    data = text
    if isinstance(text, str):
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # If it's not valid JSON, return as-is to allow pipeline to continue safely
            return text

    # 2. Clean Structure
    cleaned_data = self._clean_data(data)

    # 3. Dump Minified String (No whitespace)
    return json.dumps(cleaned_data, ensure_ascii=False, separators=(",", ":"))

Examples

from prompt_refiner import JsonCleaner

# Strip nulls and empty containers
cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
dirty_json = """
{
    "name": "Alice",
    "age": null,
    "address": {},
    "tags": [],
    "bio": ""
}
"""
result = cleaner.process(dirty_json)
# Output: {"name":"Alice"}

# Only strip nulls, keep empties
cleaner = JsonCleaner(strip_nulls=True, strip_empty=False)
result = cleaner.process(dirty_json)
# Output: {"name":"Alice","address":{},"tags":[],"bio":""}

# Only minify (no cleaning)
cleaner = JsonCleaner(strip_nulls=False, strip_empty=False)
result = cleaner.process(dirty_json)
# Output: {"name":"Alice","age":null,"address":{},"tags":[],"bio":""}

# Works with dict/list inputs too
cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
data = {"name": "Bob", "tags": [], "age": None}
result = cleaner.process(data)
# Output: {"name":"Bob"}

Common Use Cases

Web Scraping

from prompt_refiner import Refiner, StripHTML, NormalizeWhitespace, FixUnicode

web_cleaner = (
    Refiner()
    .pipe(StripHTML(to_markdown=True))
    .pipe(FixUnicode())
    .pipe(NormalizeWhitespace())
)
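
A usage sketch, assuming the composed Refiner is invoked with the same process() call as the individual refiners (the JsonCleaner docstring also shows a run() method):

html = "<h1>Report</h1><p>Summary\u200b of   findings.</p>"
result = web_cleaner.process(html)  # assumed invocation for a composed pipeline
# Roughly: "# Report Summary of findings."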

Text Normalization

from prompt_refiner import Refiner, NormalizeWhitespace, FixUnicode

text_normalizer = (
    Refiner()
    .pipe(FixUnicode())
    .pipe(NormalizeWhitespace())
)

RAG JSON Compression

from prompt_refiner import Refiner, JsonCleaner, TruncateTokens

rag_compressor = (
    Refiner()
    .pipe(JsonCleaner(strip_nulls=True, strip_empty=True))
    .pipe(TruncateTokens(max_tokens=500, strategy="head"))
)
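
Again assuming a process() call on the composed pipeline, a sketch with an illustrative API response (the field names and values here are hypothetical):

api_response = {
    "id": 42,
    "title": "Quarterly report",
    "summary": None,
    "attachments": [],
}
context = rag_compressor.process(api_response)  # assumed invocation for a composed pipeline
# JsonCleaner minifies to: {"id":42,"title":"Quarterly report"}
# TruncateTokens(max_tokens=500, strategy="head") should leave such a short string unchanged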