| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- package main
- import (
- "bufio"
- "encoding/json"
- "flag"
- "fmt"
- "io"
- "log"
- "net/http"
- "os"
- "regexp"
- "strings"
- "github.com/TecharoHQ/anubis/lib/config"
- "sigs.k8s.io/yaml"
- )
// Command-line flags. Each variable is the pointer returned by the flag
// package, so it only carries the user's value once flag.Parse has run.
var (
	// Source and destination of the conversion. main treats an input that
	// starts with http:// or https:// as a URL and "-" as stdin.
	inputFile    = flag.String("input", "", "path to robots.txt file (use - for stdin)")
	outputFile   = flag.String("output", "", "output file path (use - for stdout, defaults to stdout)")
	outputFormat = flag.String("format", "yaml", "output format: yaml or json")

	// Knobs that shape the generated rules.
	policyName    = flag.String("name", "robots-txt-policy", "name for the generated policy")
	baseAction    = flag.String("action", "CHALLENGE", "default action for disallowed paths: ALLOW, DENY, CHALLENGE, WEIGH")
	userAgentDeny = flag.String("deny-user-agents", "DENY", "action for specifically blocked user agents: DENY, CHALLENGE")
	// crawlDelay is the weight adjustment emitted for groups that declare a
	// Crawl-delay; it is not a delay value itself.
	crawlDelay = flag.Int("crawl-delay-weight", 0, "if > 0, add weight adjustment for crawl-delay (difficulty adjustment)")

	helpFlag = flag.Bool("help", false, "show help")
)
// RobotsRule is one group parsed from a robots.txt document: the user agents
// the group names and the directives that apply to them.
type RobotsRule struct {
	// UserAgents holds the User-agent values of the group, in file order.
	UserAgents []string
	// Disallows holds the non-empty Disallow values of the group.
	Disallows []string
	// Allows holds the non-empty Allow values of the group.
	Allows []string
	// CrawlDelay is the parsed Crawl-delay value, or 0 when none was given.
	CrawlDelay int
	// IsBlacklist is true when the group contains "Disallow: /", i.e. the
	// listed user agents are denied the whole site.
	IsBlacklist bool
}
- type AnubisRule struct {
- Expression *config.ExpressionOrList `yaml:"expression,omitempty" json:"expression,omitempty"`
- Challenge *config.ChallengeRules `yaml:"challenge,omitempty" json:"challenge,omitempty"`
- Weight *config.Weight `yaml:"weight,omitempty" json:"weight,omitempty"`
- Name string `yaml:"name" json:"name"`
- Action string `yaml:"action" json:"action"`
- }
// init replaces flag.Usage with a function that prints a synopsis, the flag
// defaults and a few example invocations to stderr and then exits with
// status 2.
func init() {
	// Printed verbatim, one entry per line, after the flag defaults.
	examples := []string{
		"\nExamples:",
		" # Convert local robots.txt file",
		" robots2policy -input robots.txt -output policy.yaml",
		"",
		" # Convert from URL",
		" robots2policy -input https://example.com/robots.txt -format json",
		"",
		" # Read from stdin, write to stdout",
		" curl https://example.com/robots.txt | robots2policy -input -",
	}

	flag.Usage = func() {
		prog := os.Args[0]
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", prog)
		fmt.Fprintf(os.Stderr, "%s [options] -input <robots.txt>\n\n", prog)
		flag.PrintDefaults()
		for _, line := range examples {
			fmt.Fprintln(os.Stderr, line)
		}
		os.Exit(2)
	}
}
- func main() {
- flag.Parse()
- if len(flag.Args()) > 0 || *helpFlag || *inputFile == "" {
- flag.Usage()
- }
- // Read robots.txt
- var input io.Reader
- if *inputFile == "-" {
- input = os.Stdin
- } else if strings.HasPrefix(*inputFile, "http://") || strings.HasPrefix(*inputFile, "https://") {
- resp, err := http.Get(*inputFile)
- if err != nil {
- log.Fatalf("failed to fetch robots.txt from URL: %v", err)
- }
- defer resp.Body.Close()
- input = resp.Body
- } else {
- file, err := os.Open(*inputFile)
- if err != nil {
- log.Fatalf("failed to open input file: %v", err)
- }
- defer file.Close()
- input = file
- }
- // Parse robots.txt
- rules, err := parseRobotsTxt(input)
- if err != nil {
- log.Fatalf("failed to parse robots.txt: %v", err)
- }
- // Convert to Anubis rules
- anubisRules := convertToAnubisRules(rules)
- // Check if any rules were generated
- if len(anubisRules) == 0 {
- log.Fatal("no valid rules generated from robots.txt - file may be empty or contain no disallow directives")
- }
- // Generate output
- var output []byte
- switch strings.ToLower(*outputFormat) {
- case "yaml":
- output, err = yaml.Marshal(anubisRules)
- case "json":
- output, err = json.MarshalIndent(anubisRules, "", " ")
- default:
- log.Fatalf("unsupported output format: %s (use yaml or json)", *outputFormat)
- }
- if err != nil {
- log.Fatalf("failed to marshal output: %v", err)
- }
- // Write output
- if *outputFile == "" || *outputFile == "-" {
- fmt.Print(string(output))
- } else {
- err = os.WriteFile(*outputFile, output, 0644)
- if err != nil {
- log.Fatalf("failed to write output file: %v", err)
- }
- fmt.Printf("Generated Anubis policy written to %s\n", *outputFile)
- }
- }
- func createRuleFromAccumulated(userAgents, disallows, allows []string, crawlDelay int) RobotsRule {
- rule := RobotsRule{
- UserAgents: make([]string, len(userAgents)),
- Disallows: make([]string, len(disallows)),
- Allows: make([]string, len(allows)),
- CrawlDelay: crawlDelay,
- }
- copy(rule.UserAgents, userAgents)
- copy(rule.Disallows, disallows)
- copy(rule.Allows, allows)
- return rule
- }
- func parseRobotsTxt(input io.Reader) ([]RobotsRule, error) {
- scanner := bufio.NewScanner(input)
- var rules []RobotsRule
- var currentUserAgents []string
- var currentDisallows []string
- var currentAllows []string
- var currentCrawlDelay int
- for scanner.Scan() {
- line := strings.TrimSpace(scanner.Text())
- // Skip empty lines and comments
- if line == "" || strings.HasPrefix(line, "#") {
- continue
- }
- // Split on first colon
- parts := strings.SplitN(line, ":", 2)
- if len(parts) != 2 {
- continue
- }
- directive := strings.TrimSpace(strings.ToLower(parts[0]))
- value := strings.TrimSpace(parts[1])
- switch directive {
- case "user-agent":
- // If we have accumulated rules with directives and encounter a new user-agent,
- // flush the current rules
- if len(currentUserAgents) > 0 && (len(currentDisallows) > 0 || len(currentAllows) > 0 || currentCrawlDelay > 0) {
- rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
- rules = append(rules, rule)
- // Reset for next group
- currentUserAgents = nil
- currentDisallows = nil
- currentAllows = nil
- currentCrawlDelay = 0
- }
- currentUserAgents = append(currentUserAgents, value)
- case "disallow":
- if len(currentUserAgents) > 0 && value != "" {
- currentDisallows = append(currentDisallows, value)
- }
- case "allow":
- if len(currentUserAgents) > 0 && value != "" {
- currentAllows = append(currentAllows, value)
- }
- case "crawl-delay":
- if len(currentUserAgents) > 0 {
- if delay, err := parseIntSafe(value); err == nil {
- currentCrawlDelay = delay
- }
- }
- }
- }
- // Don't forget the last group of rules
- if len(currentUserAgents) > 0 {
- rule := createRuleFromAccumulated(currentUserAgents, currentDisallows, currentAllows, currentCrawlDelay)
- rules = append(rules, rule)
- }
- // Mark blacklisted user agents (those with "Disallow: /")
- for i := range rules {
- for _, disallow := range rules[i].Disallows {
- if disallow == "/" {
- rules[i].IsBlacklist = true
- break
- }
- }
- }
- return rules, scanner.Err()
- }
// parseIntSafe scans a decimal integer from the start of s with fmt.Sscanf.
// Text following the number is ignored, so "10abc" yields 10. An error is
// returned when s does not begin with an integer, including when s is empty.
func parseIntSafe(s string) (int, error) {
	n := 0
	if _, err := fmt.Sscanf(s, "%d", &n); err != nil {
		return n, err
	}
	return n, nil
}
- func convertToAnubisRules(robotsRules []RobotsRule) []AnubisRule {
- var anubisRules []AnubisRule
- ruleCounter := 0
- // Process each robots rule individually
- for _, robotsRule := range robotsRules {
- userAgents := robotsRule.UserAgents
- // Handle crawl delay
- if robotsRule.CrawlDelay > 0 && *crawlDelay > 0 {
- ruleCounter++
- rule := AnubisRule{
- Name: fmt.Sprintf("%s-crawl-delay-%d", *policyName, ruleCounter),
- Action: "WEIGH",
- Weight: &config.Weight{Adjust: *crawlDelay},
- }
- if len(userAgents) == 1 && userAgents[0] == "*" {
- rule.Expression = &config.ExpressionOrList{
- All: []string{"true"}, // Always applies
- }
- } else if len(userAgents) == 1 {
- rule.Expression = &config.ExpressionOrList{
- All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgents[0])},
- }
- } else {
- // Multiple user agents - use any block
- var expressions []string
- for _, ua := range userAgents {
- if ua == "*" {
- expressions = append(expressions, "true")
- } else {
- expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
- }
- }
- rule.Expression = &config.ExpressionOrList{
- Any: expressions,
- }
- }
- anubisRules = append(anubisRules, rule)
- }
- // Handle blacklisted user agents
- if robotsRule.IsBlacklist {
- ruleCounter++
- rule := AnubisRule{
- Name: fmt.Sprintf("%s-blacklist-%d", *policyName, ruleCounter),
- Action: *userAgentDeny,
- }
- if len(userAgents) == 1 {
- userAgent := userAgents[0]
- if userAgent == "*" {
- // This would block everything - convert to a weight adjustment instead
- rule.Name = fmt.Sprintf("%s-global-restriction-%d", *policyName, ruleCounter)
- rule.Action = "WEIGH"
- rule.Weight = &config.Weight{Adjust: 20} // Increase difficulty significantly
- rule.Expression = &config.ExpressionOrList{
- All: []string{"true"}, // Always applies
- }
- } else {
- rule.Expression = &config.ExpressionOrList{
- All: []string{fmt.Sprintf("userAgent.contains(%q)", userAgent)},
- }
- }
- } else {
- // Multiple user agents - use any block
- var expressions []string
- for _, ua := range userAgents {
- if ua == "*" {
- expressions = append(expressions, "true")
- } else {
- expressions = append(expressions, fmt.Sprintf("userAgent.contains(%q)", ua))
- }
- }
- rule.Expression = &config.ExpressionOrList{
- Any: expressions,
- }
- }
- anubisRules = append(anubisRules, rule)
- }
- // Handle specific disallow rules
- for _, disallow := range robotsRule.Disallows {
- if disallow == "/" {
- continue // Already handled as blacklist above
- }
- ruleCounter++
- rule := AnubisRule{
- Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
- Action: *baseAction,
- }
- // Build CEL expression
- var conditions []string
- // Add user agent conditions
- if len(userAgents) == 1 && userAgents[0] == "*" {
- // Wildcard user agent - no user agent condition needed
- } else if len(userAgents) == 1 {
- conditions = append(conditions, fmt.Sprintf("userAgent.contains(%q)", userAgents[0]))
- } else {
- // For multiple user agents, we need to use a more complex expression
- // This is a limitation - we can't easily combine any for user agents with all for path
- // So we'll create separate rules for each user agent
- for _, ua := range userAgents {
- if ua == "*" {
- continue // Skip wildcard as it's handled separately
- }
- ruleCounter++
- subRule := AnubisRule{
- Name: fmt.Sprintf("%s-disallow-%d", *policyName, ruleCounter),
- Action: *baseAction,
- Expression: &config.ExpressionOrList{
- All: []string{
- fmt.Sprintf("userAgent.contains(%q)", ua),
- buildPathCondition(disallow),
- },
- },
- }
- anubisRules = append(anubisRules, subRule)
- }
- continue
- }
- // Add path condition
- pathCondition := buildPathCondition(disallow)
- conditions = append(conditions, pathCondition)
- rule.Expression = &config.ExpressionOrList{
- All: conditions,
- }
- anubisRules = append(anubisRules, rule)
- }
- }
- return anubisRules
- }
// buildPathCondition translates a robots.txt path pattern into an expression
// over the request path.
//
// A pattern without special characters becomes a prefix test with
// path.startsWith. Otherwise a regular expression anchored at the start is
// produced for path.matches: "*" matches any run of characters and a trailing
// "$" anchors the pattern to the end of the path. All other characters are
// matched literally, except that "?" matches any single character.
//
// NOTE(review): robots.txt itself gives "?" no wildcard meaning; the
// single-character behaviour is kept for compatibility with previously
// generated policies.
func buildPathCondition(robotsPath string) string {
	// Only a "$" in final position is an anchor; anywhere else it is an
	// ordinary character.
	anchored := strings.HasSuffix(robotsPath, "$")
	pattern := strings.TrimSuffix(robotsPath, "$")

	// Simple prefix match for most cases
	if !anchored && !strings.ContainsAny(pattern, "*?") {
		return fmt.Sprintf("path.startsWith(%q)", pattern)
	}

	// Escape everything first, then re-enable the supported wildcards.
	regex := regexp.QuoteMeta(pattern)
	regex = strings.ReplaceAll(regex, `\*`, `.*`) // * becomes .*
	regex = strings.ReplaceAll(regex, `\?`, `.`)  // ? becomes .
	regex = "^" + regex
	if anchored {
		regex += "$"
	}
	return fmt.Sprintf("path.matches(%q)", regex)
}
|