Cleaner Module

The Cleaner module provides operations for cleaning dirty data, including HTML removal, whitespace normalization, Unicode fixing, and JSON compression.

StripHTML

Remove HTML tags from text, with options to preserve semantic tags or convert to Markdown.

prompt_refiner.cleaner.StripHTML

StripHTML(preserve_tags=None, to_markdown=False)

Bases: Refiner

Remove HTML tags from text, with options to preserve semantic tags or convert to Markdown.

Initialize the HTML stripper.

Parameters:

Name           Type                Description                                                 Default
preserve_tags  Optional[Set[str]]  Set of tag names to preserve (e.g., {'p', 'li', 'table'})   None
to_markdown    bool                Convert common HTML tags to Markdown syntax                 False
Source code in src/prompt_refiner/cleaner/html.py
def __init__(
    self,
    preserve_tags: Optional[Set[str]] = None,
    to_markdown: bool = False,
):
    """
    Initialize the HTML stripper.

    Args:
        preserve_tags: Set of tag names to preserve (e.g., {'p', 'li', 'table'})
        to_markdown: Convert common HTML tags to Markdown syntax
    """
    self.preserve_tags = preserve_tags or set()
    self.to_markdown = to_markdown

Functions

process
process(text)

Remove HTML tags from the input text.

Parameters:

Name  Type  Description                     Default
text  str   The input text containing HTML  required

Returns:

Type  Description
str   Text with HTML tags removed or converted to Markdown

Source code in src/prompt_refiner/cleaner/html.py
def process(self, text: str) -> str:
    """
    Remove HTML tags from the input text.

    Args:
        text: The input text containing HTML

    Returns:
        Text with HTML tags removed or converted to Markdown
    """
    result = text

    if self.to_markdown:
        # Convert common HTML tags to Markdown
        # Bold
        result = re.sub(r"<strong>(.*?)</strong>", r"**\1**", result, flags=re.DOTALL)
        result = re.sub(r"<b>(.*?)</b>", r"**\1**", result, flags=re.DOTALL)
        # Italic
        result = re.sub(r"<em>(.*?)</em>", r"*\1*", result, flags=re.DOTALL)
        result = re.sub(r"<i>(.*?)</i>", r"*\1*", result, flags=re.DOTALL)
        # Links
        result = re.sub(
            r'<a[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            r"[\2](\1)",
            result,
            flags=re.DOTALL,
        )
        # Headers
        for i in range(1, 7):
            result = re.sub(
                f"<h{i}[^>]*>(.*?)</h{i}>",
                f"{'#' * i} \\1\n",
                result,
                flags=re.DOTALL,
            )
        # Code
        result = re.sub(r"<code>(.*?)</code>", r"`\1`", result, flags=re.DOTALL)
        # Lists - simple conversion
        result = re.sub(r"<li[^>]*>(.*?)</li>", r"- \1\n", result, flags=re.DOTALL)
        # Paragraphs
        result = re.sub(r"<p[^>]*>(.*?)</p>", r"\1\n\n", result, flags=re.DOTALL)
        # Line breaks
        result = re.sub(r"<br\s*/?>", "\n", result)

    if self.preserve_tags:
        # Remove all tags except preserved ones
        # This is a simplified implementation
        tags_pattern = r"</?(?!" + "|".join(self.preserve_tags) + r"\b)[^>]+>"
        result = re.sub(tags_pattern, "", result)
    else:
        # Remove all HTML tags
        result = re.sub(r"<[^>]+>", "", result)

    # Clean up excessive newlines
    result = re.sub(r"\n{3,}", "\n\n", result)

    return result.strip()

Examples

from prompt_refiner import StripHTML

# Basic HTML stripping
stripper = StripHTML()
result = stripper.process("<p>Hello <b>World</b>!</p>")
# Output: "Hello World!"

# Convert to Markdown
stripper = StripHTML(to_markdown=True)
result = stripper.process("<p>Hello <b>World</b>!</p>")
# Output: "Hello **World**!\n\n"

# Preserve specific tags
stripper = StripHTML(preserve_tags={"p", "div"})
result = stripper.process("<div>Keep <b>Remove</b></div>")
# Output: "<div>Keep Remove</div>"

NormalizeWhitespace

Collapse excessive whitespace, tabs, and newlines into single spaces.

prompt_refiner.cleaner.NormalizeWhitespace

Bases: Refiner

Normalize whitespace in text.

Functions

process
process(text)

Normalize whitespace by collapsing multiple spaces into one.

Parameters:

Name  Type  Description     Default
text  str   The input text  required

Returns:

Type  Description
str   Text with normalized whitespace

Source code in src/prompt_refiner/cleaner/whitespace.py
def process(self, text: str) -> str:
    """
    Normalize whitespace by collapsing multiple spaces into one.

    Args:
        text: The input text

    Returns:
        Text with normalized whitespace
    """
    # Replace multiple whitespace with single space and strip edges
    return " ".join(text.split())

Examples

from prompt_refiner import NormalizeWhitespace

normalizer = NormalizeWhitespace()
result = normalizer.process("Hello    World  \t\n  Foo")
# Output: "Hello World Foo"

FixUnicode

Remove problematic Unicode characters including zero-width spaces and control characters.

prompt_refiner.cleaner.FixUnicode

FixUnicode(
    remove_zero_width=True, remove_control_chars=True
)

Bases: Refiner

Remove or fix problematic Unicode characters.

Initialize the Unicode fixer.

Parameters:

Name                  Type  Description                                            Default
remove_zero_width     bool  Remove zero-width spaces and similar characters        True
remove_control_chars  bool  Remove control characters (except newlines and tabs)   True
Source code in src/prompt_refiner/cleaner/unicode.py
def __init__(self, remove_zero_width: bool = True, remove_control_chars: bool = True):
    """
    Initialize the Unicode fixer.

    Args:
        remove_zero_width: Remove zero-width spaces and similar characters
        remove_control_chars: Remove control characters (except newlines and tabs)
    """
    self.remove_zero_width = remove_zero_width
    self.remove_control_chars = remove_control_chars

Functions

process
process(text)

Clean problematic Unicode characters from text.

Parameters:

Name  Type  Description     Default
text  str   The input text  required

Returns:

Type  Description
str   Text with problematic Unicode characters removed

Source code in src/prompt_refiner/cleaner/unicode.py
def process(self, text: str) -> str:
    """
    Clean problematic Unicode characters from text.

    Args:
        text: The input text

    Returns:
        Text with problematic Unicode characters removed
    """
    result = text

    if self.remove_zero_width:
        # Remove zero-width characters
        zero_width_chars = [
            "\u200b",  # Zero-width space
            "\u200c",  # Zero-width non-joiner
            "\u200d",  # Zero-width joiner
            "\ufeff",  # Zero-width no-break space (BOM)
            "\u2060",  # Word joiner
        ]
        for char in zero_width_chars:
            result = result.replace(char, "")

    if self.remove_control_chars:
        # Remove control characters except newlines, tabs, and carriage returns
        # Keep: \n (0x0A), \t (0x09), \r (0x0D)
        result = "".join(
            char
            for char in result
            if not unicodedata.category(char).startswith("C") or char in ("\n", "\t", "\r")
        )

    # Normalize Unicode to NFC form (canonical composition)
    result = unicodedata.normalize("NFC", result)

    return result

Examples

from prompt_refiner import FixUnicode

# Remove zero-width spaces and control chars
fixer = FixUnicode()
result = fixer.process("Hello\u200bWorld\u0000")
# Output: "HelloWorld"

# Only remove zero-width spaces
fixer = FixUnicode(remove_control_chars=False)
result = fixer.process("Hello\u200bWorld")
# Output: "HelloWorld"

JsonCleaner

Clean and minify JSON by removing null values and empty containers.

prompt_refiner.cleaner.JsonCleaner

JsonCleaner(strip_nulls=True, strip_empty=True)

Bases: Refiner

Cleans and minifies JSON strings. Removes null values, empty containers, and extra whitespace.

Parameters:

Name         Type  Description                                                                Default
strip_nulls  bool  If True, remove null/None values from objects and arrays (default: True)  True
strip_empty  bool  If True, remove empty dicts, lists, and strings (default: True)           True
Example

>>> from prompt_refiner import JsonCleaner
>>> cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
>>> dirty_json = '''
... {
...     "name": "Alice",
...     "age": null,
...     "address": {},
...     "tags": [],
...     "bio": ""
... }
... '''
>>> result = cleaner.run(dirty_json)
>>> print(result)

Use Cases
  • RAG Context Compression: Strip nulls/empties from API responses before feeding to LLM
  • Cost Optimization: Reduce token count by removing unnecessary JSON structure
  • Data Cleaning: Normalize JSON from multiple sources with inconsistent null handling

Initialize JSON cleaner.

Parameters:

Name         Type  Description                                Default
strip_nulls  bool  Remove null/None values                    True
strip_empty  bool  Remove empty containers (dict, list, str)  True
Source code in src/prompt_refiner/cleaner/json.py
def __init__(self, strip_nulls: bool = True, strip_empty: bool = True):
    """
    Initialize JSON cleaner.

    Args:
        strip_nulls: Remove null/None values
        strip_empty: Remove empty containers (dict, list, str)
    """
    self.strip_nulls = strip_nulls
    self.strip_empty = strip_empty

Functions

process
process(text)

Process the input JSON (string or object). Returns a minified JSON string.

Parameters:

Name  Type                     Description                          Default
text  Union[str, Dict, List]   JSON string, dict, or list to clean  required

Returns:

Type  Description
str   Minified JSON string with nulls/empties removed

Note

If input is not valid JSON, returns input unchanged.

Source code in src/prompt_refiner/cleaner/json.py
def process(self, text: Union[str, Dict, List]) -> str:
    """
    Process the input JSON (string or object).
    Returns a minified JSON string.

    Args:
        text: JSON string, dict, or list to clean

    Returns:
        Minified JSON string with nulls/empties removed

    Note:
        If input is not valid JSON, returns input unchanged.
    """
    # 1. Parse Input (Handle both string JSON and raw Dict/List)
    data = text
    if isinstance(text, str):
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # If it's not valid JSON, return as-is to allow pipeline to continue safely
            return text

    # 2. Clean Structure
    cleaned_data = self._clean_data(data)

    # 3. Dump Minified String (No whitespace)
    return json.dumps(cleaned_data, ensure_ascii=False, separators=(",", ":"))

Examples

from prompt_refiner import JsonCleaner

# Strip nulls and empty containers
cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
dirty_json = """
{
    "name": "Alice",
    "age": null,
    "address": {},
    "tags": [],
    "bio": ""
}
"""
result = cleaner.process(dirty_json)
# Output: {"name":"Alice"}

# Only strip nulls, keep empties
cleaner = JsonCleaner(strip_nulls=True, strip_empty=False)
result = cleaner.process(dirty_json)
# Output: {"name":"Alice","address":{},"tags":[],"bio":""}

# Only minify (no cleaning)
cleaner = JsonCleaner(strip_nulls=False, strip_empty=False)
result = cleaner.process(dirty_json)
# Output: {"name":"Alice","age":null,"address":{},"tags":[],"bio":""}

# Works with dict/list inputs too
cleaner = JsonCleaner(strip_nulls=True, strip_empty=True)
data = {"name": "Bob", "tags": [], "age": None}
result = cleaner.process(data)
# Output: {"name":"Bob"}

Common Use Cases

Web Scraping

from prompt_refiner import Refiner, StripHTML, NormalizeWhitespace, FixUnicode

web_cleaner = (
    Refiner()
    .pipe(StripHTML(to_markdown=True))
    .pipe(FixUnicode())
    .pipe(NormalizeWhitespace())
)
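
A usage sketch, assuming the composed Refiner is invoked with the same process() call as the individual refiners (the JsonCleaner docstring also shows a run() method):

html = "<h1>Report</h1><p>Summary\u200b of   findings.</p>"
result = web_cleaner.process(html)  # assumed invocation for a composed pipeline
# Roughly: "# Report Summary of findings."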

Text Normalization

from prompt_refiner import Refiner, NormalizeWhitespace, FixUnicode

text_normalizer = (
    Refiner()
    .pipe(FixUnicode())
    .pipe(NormalizeWhitespace())
)

RAG JSON Compression

from prompt_refiner import Refiner, JsonCleaner, TruncateTokens

rag_compressor = (
    Refiner()
    .pipe(JsonCleaner(strip_nulls=True, strip_empty=True))
    .pipe(TruncateTokens(max_tokens=500, strategy="head"))
)
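
Again assuming a process() call on the composed pipeline, a sketch with an illustrative API response (the field names and values here are hypothetical):

api_response = {
    "id": 42,
    "title": "Quarterly report",
    "summary": None,
    "attachments": [],
}
context = rag_compressor.process(api_response)  # assumed invocation for a composed pipeline
# JsonCleaner minifies to: {"id":42,"title":"Quarterly report"}
# TruncateTokens(max_tokens=500, strategy="head") should leave such a short string unchanged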