php用正则表达式抓取网页中文章( 示例：在字符串1000abcd123中找出前后两个数字。)

优采云发布时间: 2022-02-23 10:11

　　php用正则表达抓取网页中文章(

示例：在字符串1000abcd123中找出前后两个数字。)

　　示例：查找字符串 1000abcd123 前后的两个数字。

　　示例1：匹配此字符串的示例

　　package main

import(

"fmt"

"regexp"

)

// digitsRegexp captures two runs of digits that are separated by at least
// one non-digit character. It is compiled once at package scope.
var (
	digitsRegexp = regexp.MustCompile(`(\d+)\D+(\d+)`)
)

// main prints the full match and both digit groups that digitsRegexp finds
// in a fixed sample string.
func main() {
	const sample = "1000abcd123"
	groups := digitsRegexp.FindStringSubmatch(sample)
	fmt.Println(groups)
}

　　以上代码输出：

　　[1000abcd123 1000 123]

　　示例 2：使用命名正则表达式

　　package main

import(

"fmt"

"regexp"

)

// myExp matches three dot-separated runs of digits. The first run is
// captured as the named group "first", the middle run is an unnamed group,
// and the last run is captured as the named group "second".
//
// Both separators are escaped so they match a literal '.' only; an unescaped
// dot would accept any character between the second and third run.
var myExp = regexp.MustCompile(`(?P<first>\d+)\.(\d+)\.(?P<second>\d+)`)

// main prints every submatch (the whole match followed by each capture
// group) that myExp finds in a fixed sample string.
func main() {
	groups := myExp.FindStringSubmatch("1234.5678.9")
	fmt.Printf("%+v", groups)
}

　　以上代码输出，所有匹配均输出：

　　[1234.5678.9 1234 5678 9]

　　这里使用的命名捕获组写法 (?P<name>…) 是 Python 和 Go 语言采用的命名方式；Java 和 C# 则使用 (?<name>…) 的写法。

　　例子3：扩展一个正则表达式类，用一个方法来获取所有的命名信息并使用它。

　　package main

import(

"fmt"

"regexp"

)

// myRegexp embeds *regexp.Regexp so that extra helper methods can be added
// while every method of the embedded type stays available.
type myRegexp struct {
	*regexp.Regexp
}

// FindStringSubmatchMap runs FindStringSubmatch on s and returns the text
// captured by each named group, keyed by group name. The whole-expression
// match and unnamed groups are left out. When s does not match, the returned
// map is empty (but not nil).
func (r *myRegexp) FindStringSubmatchMap(s string) map[string]string {
	named := make(map[string]string)

	groups := r.FindStringSubmatch(s)
	if groups == nil {
		return named
	}

	groupNames := r.SubexpNames()
	// Index 0 is the whole match, so start at the first capture group.
	for idx := 1; idx < len(groupNames); idx++ {
		if key := groupNames[idx]; key != "" {
			named[key] = groups[idx]
		}
	}
	return named
}

// myExp is the example expression: three dot-separated runs of digits, with
// the first run captured as the named group "first", the middle run as an
// unnamed group, and the last run as the named group "second".
//
// Both separators are escaped so they match a literal '.' only; an unescaped
// dot would accept any character between the second and third run.
var myExp = myRegexp{regexp.MustCompile(`(?P<first>\d+)\.(\d+)\.(?P<second>\d+)`)}

// main extracts the named captures from a fixed sample string, then prints
// the whole capture map followed by the value of the "first" group.
func main() {
	captures := myExp.FindStringSubmatchMap("1234.5678.9")
	first := captures["first"]

	fmt.Println(captures)
	fmt.Println(first)
}

　　以上代码的输出：

　　map[first:1234 second:9] 

1234

　　例4，抓取限号信息，记录在Map中。

　　package main

import(

"fmt"

iconv "github.com/djimenez/iconv-go"

"io/ioutil"

"net/http"

"os"

"regexp"

)

// myRegexp embeds *regexp.Regexp so that extra helper methods can be added
// while every method of the embedded type stays available.
type myRegexp struct {
	*regexp.Regexp
}

// FindStringSubmatchMap finds every non-overlapping match of the expression
// in s and returns one map per match, in match order. Each map holds the text
// captured by the named groups, keyed by group name; the whole-expression
// match and unnamed groups are left out. When nothing matches, the returned
// slice is empty (but not nil).
func (r *myRegexp) FindStringSubmatchMap(s string) []map[string]string {
	all := r.FindAllStringSubmatch(s, -1)
	groupNames := r.SubexpNames()

	results := make([]map[string]string, 0, len(all))
	for _, groups := range all {
		named := make(map[string]string)
		for idx := range groups {
			// Index 0 (the whole match) always has an empty name, so this
			// check also skips it.
			if key := groupNames[idx]; key != "" {
				named[key] = groups[idx]
			}
		}
		results = append(results, named)
	}
	return results
}

// myExp is the regular expression that extracts the licence-plate
// restriction schedule. Named groups:
//
//	byear, bmonth, bday - start date of the period
//	eyear, emonth, eday - end date of the period
//	nD1, nD2            - the two restricted plate-tail digits for weekday D
//	                      (D = 1..5, Monday to Friday)
//
// NOTE(review): the literal text between "限行" and "车牌尾号" is assumed to be
// "机动车"; confirm against the live page before relying on this pattern.
var myExp = myRegexp{regexp.MustCompile(`自(?P<byear>[\d]{4})年(?P<bmonth>[\d]{1,2})月(?P<bday>[\d]{1,2})日至(?P<eyear>[\d]{4})年(?P<emonth>[\d]{1,2})月(?P<eday>[\d]{1,2})日，星期一至星期五限行机动车车牌尾号分别为：(?P<n11>[\d])和(?P<n12>[\d])、(?P<n21>[\d])和(?P<n22>[\d])、(?P<n31>[\d])和(?P<n32>[\d])、(?P<n41>[\d])和(?P<n42>[\d])、(?P<n51>[\d])和(?P<n52>[\d])`)}

// ErrorAndExit writes err, followed by a newline, to standard error and then
// terminates the process with exit status 1. It never returns.
func ErrorAndExit(err error) {
	fmt.Fprintf(os.Stderr, "%v\n", err)
	os.Exit(1)
}

// main downloads the plate-restriction page, converts it from GB2312 to
// UTF-8, extracts every restriction period with myExp, and prints the
// resulting slice of capture maps. Any failure is reported through
// ErrorAndExit.
func main() {
	response, err := http.Get("http://www.bjjtgl.gov.cn/zhuanti/10weihao/index.html")
	if err != nil {
		ErrorAndExit(err)
	}
	// Deferred only after the error check: when http.Get fails the response
	// is nil, and touching response.Body would panic.
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		ErrorAndExit(fmt.Errorf("unexpected HTTP status %s", response.Status))
	}

	input, err := ioutil.ReadAll(response.Body)
	if err != nil {
		ErrorAndExit(err)
	}

	// GB2312 text grows when re-encoded as UTF-8, so a fixed output buffer of
	// len(input) bytes would truncate the page. ConvertString sizes its own
	// output and reports conversion errors instead of dropping them.
	body, err := iconv.ConvertString(string(input), "gb2312", "utf-8")
	if err != nil {
		ErrorAndExit(err)
	}

	mmap := myExp.FindStringSubmatchMap(body)
	fmt.Println(mmap)
}

　　以上代码输出：

　　[map[n32:0 n22:9 emonth:7 n11:3 n41:1 n21:4 n52:7 bmonth:4 n51:2 bday:9 n42:6 byear:2012 eday:7 eyear:2012 n12:8 n31:5]

map[emonth:10 n41:5 n52:6 n31:4 byear:2012 n51:1 eyear:2012 n32:9 bmonth:7 n22:8 bday:8 n11:2 eday:6 n42:0 n21:3 n12:7]

map[bday:7 n51:5 n22:7 n31:3 eday:5 n32:8 byear:2012 bmonth:10 emonth:1 eyear:2013 n11:1 n12:6 n52:0 n21:2 n42:9 n41:4]

map[eyear:2013 byear:2013 n22:6 eday:10 bmonth:1 n41:3 n32:7 n31:2 n21:1 n11:5 bday:6 n12:0 n51:4 n42:8 emonth:4 n52:9]]

　　更多go语言知识，请关注PHP中文网站go语言教程专栏。

　　以上是Go语言使用正则表达式提取网页文本的详细内容。更多详情请关注php中文网其他相关话题文章！

　　免责声明：本文转载于：博客园，如有侵权，请联系删除

　　特别推荐：Go语言正则

0

2022-02-23

php用正则表达抓取网页中文章

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

php用正则表达抓取网页中文章( 示例：在字符串1000abcd123中找出前后前后两个数字。)

0 个评论

发起人

AI时代内容工厂

php用正则表达抓取网页中文章( 示例：在字符串1000abcd123中找出前后前后两个数字。)

0 个评论

发起人

相关问题