def initialize_grammar():
    """
    Initialize the CNF grammar with probabilities.

    Returns:
        binary_rules: dict mapping non-terminals to a list of (rhs, probability) pairs
                      for binary rules (e.g., S -> NP VP).
        lexical_rules: dict mapping non-terminals to a dict of terminal probabilities
                       (e.g., NN -> {"flight": 0.5, "cat": 0.5}).

    Note: the recognizer below only tests set membership, so the probabilities
    are kept for a probabilistic extension but are otherwise unused.
    """
    # Binary (non-terminal) rules with probabilities
    binary_rules = {
        "S": [ (["NP", "VP"], 0.3), (["NN", "VP"], 0.2), (["PRP", "VBD"], 0.2), (["NN", "VBD"], 0.3) ],
        "NP": [ (["DT", "NN"], 0.1) ],
        "VP": [ (["VBD", "NP"], 0.6), (["VBD", "PP"], 0.4) ],
        "PP": [ (["IN", "NP"], 1.0) ]
    }

    # Lexical (terminal) rules with probabilities
    lexical_rules = {
        "DT": {"the": 1.0},
        "NN": {"flight": 0.5, "cat": 0.5},
        "PRP": {"She": 1.0},
        "VBD": {"left": 0.6, "saw": 0.4},
        "IN": {"to": 1.0}
    }

    return binary_rules, lexical_rules
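
# A quick look at the structures returned above (a sketch; the values are just
# the grammar defined in initialize_grammar):
#
#   >>> binary_rules, lexical_rules = initialize_grammar()
#   >>> binary_rules["VP"]
#   [(['VBD', 'NP'], 0.6), (['VBD', 'PP'], 0.4)]
#   >>> lexical_rules["VBD"]["saw"]
#   0.4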

def tokenize_sentence(sentence):
    """
    Tokenize the input sentence.

    For simplicity, we convert to lowercase and split by whitespace.
    Adjust as needed to handle punctuation or case sensitivity based on your grammar.

    Args:
        sentence (str): The input sentence.

    Returns:
        List[str]: A list of tokens.
    """
    # Lowercasing here means every lexicon entry must be lowercase as well.
    return sentence.lower().split()
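
# Example: tokenize_sentence("Cat saw the flight") -> ['cat', 'saw', 'the', 'flight'].
# Because tokens come out lowercased, the lexicon above stores only lowercase words.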

def initialize_cyk_table(tokens, lexical_rules):
    """
    Initialize the CYK table (a 2D list) for the tokens.

    The table[i][j] will eventually hold the set of non-terminals that derive the substring tokens[i:j+1].

    Args:
        tokens (List[str]): Tokenized sentence.
        lexical_rules (dict): The lexical rules of the grammar.

    Returns:
        list: A 2D list representing the CYK table.
    """
    n = len(tokens)
    table = [[set() for _ in range(n)] for _ in range(n)]

    # Fill the diagonal with non-terminals that produce the token via lexical rules.
    for i, token in enumerate(tokens):
        for lhs, productions in lexical_rules.items():
            # If lhs can produce this token, record lhs in the diagonal cell.
            if token in productions:
                table[i][i].add(lhs)
    return table
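
# For tokens ['cat', 'saw', 'the', 'flight'] the diagonal comes out as
# table[0][0] = {'NN'}, table[1][1] = {'VBD'}, table[2][2] = {'DT'},
# table[3][3] = {'NN'}; every off-diagonal cell is still empty.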

def fill_cyk_table(tokens, table, binary_rules):
    """
    Fill the CYK table using the binary production rules.

    Use dynamic programming to combine subparts of the sentence.

    Args:
        tokens (List[str]): Tokenized sentence.
        table (list): The current CYK table with initialized lexical entries.
        binary_rules (dict): The binary rules of the grammar.

    Returns:
        list: The completed CYK table.
    """
    n = len(tokens)
    # Loop over lengths of substrings
    for span in range(2, n+1):  # span length from 2 to n
        for start in range(n - span + 1):
            end = start + span - 1
            # Consider all possible splits
            for split in range(start, end):
                left_cell = table[start][split]
                right_cell = table[split+1][end]
                # For every binary rule lhs -> B C, check whether B appears in
                # left_cell and C in right_cell; if so, lhs derives this span.
                for lhs, productions in binary_rules.items():
                    for (rhs, prob) in productions:
                        # Check if rhs[0] is in left_cell and rhs[1] in right_cell.
                        if rhs[0] in left_cell and rhs[1] in right_cell:
                            table[start][end].add(lhs)
    return table
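
# Worked trace for ['cat', 'saw', 'the', 'flight']:
#   span 2: table[0][1] = {'S'}  (S -> NN VBD), table[2][3] = {'NP'} (NP -> DT NN)
#   span 3: table[1][3] = {'VP'} (VP -> VBD NP); table[0][2] stays empty
#   span 4: table[0][3] = {'S'}  (S -> NN VP), so the whole sentence parses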

def is_sentence_valid(table):
    """
    Determine if the sentence is valid by checking if the start symbol 'S'
    appears in the top-right cell of the CYK table.

    Args:
        table (list): The filled CYK table.

    Returns:
        bool: True if 'S' is present, False otherwise.
    """
    n = len(table)
    return 'S' in table[0][n-1]
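
# The rule probabilities defined in initialize_grammar() are ignored by the
# recognizer above. As a minimal sketch of how they could be used (this
# fill_cyk_table_prob helper is an illustration, not part of the recognizer),
# a probabilistic CKY pass stores, per cell, the best probability with which
# each non-terminal derives the corresponding substring:
def fill_cyk_table_prob(tokens, lexical_rules, binary_rules):
    n = len(tokens)
    # best[i][j] maps non-terminal -> max probability of deriving tokens[i:j+1]
    best = [[dict() for _ in range(n)] for _ in range(n)]
    for i, token in enumerate(tokens):
        for lhs, productions in lexical_rules.items():
            if token in productions:
                best[i][i][lhs] = productions[token]
    for span in range(2, n + 1):
        for start in range(n - span + 1):
            end = start + span - 1
            for split in range(start, end):
                for lhs, productions in binary_rules.items():
                    for rhs, prob in productions:
                        if rhs[0] in best[start][split] and rhs[1] in best[split + 1][end]:
                            p = prob * best[start][split][rhs[0]] * best[split + 1][end][rhs[1]]
                            if p > best[start][end].get(lhs, 0.0):
                                best[start][end][lhs] = p
    return best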

def print_cyk_table(table, tokens):
    """Pretty-print the upper-triangular CYK table, labeling rows and columns with tokens."""
    n = len(tokens)

    # Print the header for the table with tokens centered in fixed-width fields.
    header = " " * 12  # Initial space for the row labels.
    for token in tokens:
        header += f"{token:^10} "  # trailing space matches the one appended after each cell below
    print("CYK Table:")
    print(header)
    print("-" * len(header))

    # Print each row of the CYK table.
    for i in range(n):
        # Use the token at position i as the row label.
        row_label = tokens[i]  # i always indexes a token, since n == len(tokens)
        row_str = f"{row_label:<12}"  # Left-align the row label within 12 spaces.
        for j in range(n):
            if j < i:
                # For lower triangle cells, print fixed-width blank space.
                row_str += " " * 10
            else:
                # Convert the set to a sorted list for consistency.
                cell = sorted(list(table[i][j]))
                cell_str = str(cell)
                row_str += f"{cell_str:^10}"  # Center-align the cell contents.
            row_str += " "  # Extra space between cells.
        print(row_str)



def main():
    # Step 1: Initialize grammar rules
    binary_rules, lexical_rules = initialize_grammar()

    # Step 2: Input sentence and tokenize
    sentence = "Cat saw the flight"
    tokens = tokenize_sentence(sentence)

    # Step 3: Initialize the CYK table with lexical rules
    table = initialize_cyk_table(tokens, lexical_rules)

    # Step 4: Fill the CYK table using binary rules (dynamic programming)
    table = fill_cyk_table(tokens, table, binary_rules)

    # Step 5: Check if the sentence is valid according to the grammar
    valid = is_sentence_valid(table)

    # Output the result
    print(f"Sentence: '{sentence}'")
    print("Valid sentence according to the grammar:" if valid else "Invalid sentence according to the grammar.")

    print_cyk_table(table, tokens)

if __name__ == "__main__":
    main()
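
# Expected output for the sample sentence (column alignment is approximate):
#   Sentence: 'Cat saw the flight'
#   Valid sentence according to the grammar.
#   CYK Table:
#                  cat        saw        the      flight
#   cat           ['NN']     ['S']       []        ['S']
#   saw                     ['VBD']      []       ['VP']
#   the                                ['DT']     ['NP']
#   flight                                        ['NN']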