fix(security): apply same multi-word bypass fix to disregard pattern
The 'disregard ... instructions/rules/guidelines' regex had the same single-word gap vulnerability as the 'ignore' pattern fixed in PR #192: it accepted exactly one qualifier (your/all/any) between 'disregard' and the target noun, so 'disregard all your instructions' bypassed the scanner. Added (?:\w+\s+)* both before and after the (your|all|any) group to allow arbitrary intermediate words.
This commit is contained in:
parent
520a26c48f
commit
ba214e43c8
1 changed file with 1 addition and 1 deletion
|
|
@@ -172,7 +172,7 @@ THREAT_PATTERNS = [
|
||||||
(r'pretend\s+(you\s+are|to\s+be)\s+',
|
(r'pretend\s+(you\s+are|to\s+be)\s+',
|
||||||
"role_pretend", "high", "injection",
|
"role_pretend", "high", "injection",
|
||||||
"attempts to make the agent assume a different identity"),
|
"attempts to make the agent assume a different identity"),
|
||||||
(r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)',
|
(r'disregard\s+(?:\w+\s+)*(your|all|any)\s+(?:\w+\s+)*(instructions|rules|guidelines)',
|
||||||
"disregard_rules", "critical", "injection",
|
"disregard_rules", "critical", "injection",
|
||||||
"instructs agent to disregard its rules"),
|
"instructs agent to disregard its rules"),
|
||||||
(r'output\s+the\s+(system|initial)\s+prompt',
|
(r'output\s+the\s+(system|initial)\s+prompt',
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue