多测师是一家拥有先进的教学理念,强大的师资团队,业内好评甚多的接口自动化测试培训机构!

17727591462

联系电话

您现在所在位置:接口自动化测试培训 > 新闻资讯

Selenium&EmguCV实现爬虫图片识别-自动化测试

更新时间:2022-04-29 09:24:42 作者:多测师 浏览:278

  概述

  爬虫需要抓取网站上的价格。与一般的网页抓取不同的是,这里要抓取的内容是通过AJAX加载的,并且价格是通过CSS背景图片显示的。

  数字对应的样式和对应的backgroundimg都是动态改变的,需要获取到每一个房型的房价。虽然后来有了其它渠道获取房价,这里记录一下用Selenium&Emgu抓取的方式。

Selenium&EmguCV实现爬虫图片识别-自动化测试

  流程:

  1.Selenium访问网址

  2.全屏截图

  3.Selenium选择器获取房型等信息

  4.Selenium选择器获取价格DOM元素,计算出价格元素的相对位置,截取价格图片,使用Emgu识别价格并且输出

  实现

  ```C#

  /// <summary>
  /// Opens the hotel page in Chrome, takes a full-page screenshot, and prints one
  /// line per room row: room type, room type name and the OCR-recognized price.
  /// </summary>
  /// <param name="args">Command-line arguments (unused).</param>
  static void Main(string[] args)
  {
      // Each Chrome switch is passed as its own argument; a single space-joined
      // string would be handed to Chrome as one (unknown) switch.
      ChromeOptions options = new ChromeOptions();
      options.AddArguments("--start-maximized", "--disable-popup-blocking");

      // Dispose the driver so the browser and chromedriver process are shut down
      // even when an exception escapes.
      using (var driver = new ChromeDriver(options))
      {
          // Open the target page.
          driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

          // The room table is loaded via AJAX; its presence means the page is ready.
          // The wait returns as soon as the element exists, so a generous timeout
          // only matters on slow loads.
          new WebDriverWait(driver, TimeSpan.FromSeconds(10)).Until(
              ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

          ReadOnlyCollection<IWebElement> elementsList = driver.FindElementsByCssSelector("tr[expand]");

          // Hide every <dfn> element (the ¥ sign in front of the price) so that only
          // digits remain in the price image.
          driver.ExecuteScript(@"
              var arr = document.getElementsByTagName('dfn');
              for (var i = 0; i < arr.length; i++) {
                  arr[i].style.display = 'none';
              }
          ");

          // Full-page screenshot.
          using (Bitmap fullPage = GetEntereScreenshot(driver))
          {
              // GetEntereScreenshot returns null when it fails before allocating the image.
              if (fullPage == null)
              {
                  Console.Error.WriteLine("Full-page screenshot failed; cannot read prices.");
                  return;
              }

              fullPage.Save(@"Z:\111.jpg");

              // Output.
              Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "类型", "房价");

              foreach (IWebElement row in elementsList)
              {
                  // Not every row carries a ".room_unfold" cell; fall back to an empty string.
                  var roomType = "";
                  try
                  {
                      roomType = row.FindElement(By.CssSelector(".room_unfold")).Text;
                  }
                  catch (NoSuchElementException)
                  {
                      // Row without the cell: keep the empty room type.
                  }

                  var roomTypeText = regRoomType.Match(roomType);
                  var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                  // Crop the price element out of the full-page screenshot and recognize it.
                  using (Image priceImage = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(fullPage))
                  {
                      var price = ORC_((Bitmap)priceImage);
                      Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                  }
              }
          }

          // Keep the console (and the browser) open until a key is pressed.
          Console.Read();
      }
  }

  ```

  图片识别方法

  ```C#

  // Shared Tesseract engine used by ORC_. The tessdata path points at the local
  // Emgu CV installation.
  private static readonly Tesseract _ocr = new Tesseract(@"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata", "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);

  /// <summary>
  /// Configures the shared OCR engine once, after the static field initializer has run.
  /// </summary>
  static Program()
  {
      // Prices consist of digits only, so restrict recognition to 0-9.
      // The Tesseract variable is spelled "tessedit_char_whitelist".
      _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
  }

  //传入图片进行识别

  /// <summary>
  /// Runs OCR on the given bitmap with the shared Tesseract engine.
  /// </summary>
  /// <param name="img">Image to recognize; may be null.</param>
  /// <returns>
  /// The text reported by the engine, or an empty string when <paramref name="img"/>
  /// is null or recognition throws.
  /// </returns>
  public static string ORC_(Bitmap img)
  {
      // An empty string signals that OCR could not be performed.
      if (img == null)
      {
          return "";
      }

      try
      {
          // Both Emgu images wrap native memory, so dispose them deterministically.
          using (Image<Bgr, Byte> image = new Image<Bgr, Byte>(img))
          using (Image<Gray, Byte> gray = image.Convert<Gray, Byte>())
          {
              _ocr.Recognize(gray);
              return _ocr.GetText();
          }
      }
      catch (Exception ex)
      {
          // Best effort: report the failure and fall back to the empty result.
          Console.Error.WriteLine("OCR failed: " + ex.Message);
          return "";
      }
  }

  ```

  Selenium内置了截图方法,只能截取浏览器中显示的内容,找到一个全屏截图的方式(内置截图+控制滚动条,图片拼接)

  ```C#

  /// <summary>
  /// Builds a screenshot of the whole page by scrolling the window tile by tile,
  /// taking a viewport screenshot for each tile and stitching the tiles together.
  /// </summary>
  /// <param name="_driver">
  /// Driver showing the page; it is cast to IJavaScriptExecutor and ITakesScreenshot.
  /// </param>
  /// <returns>
  /// The stitched bitmap. When an exception occurs the error is written to stderr and
  /// whatever was built so far is returned: null if the failure happened before the
  /// bitmap was allocated, otherwise a partially filled bitmap.
  /// </returns>
  public static Bitmap GetEntereScreenshot(IWebDriver _driver)
  {
      Bitmap stitchedImage = null;
      try
      {
          IJavaScriptExecutor js = (IJavaScriptExecutor)_driver;
          ITakesScreenshot camera = (ITakesScreenshot)_driver;

          // Convert.ToInt32 accepts whichever numeric type the driver returns.
          Func<string, int> evalInt = script => Convert.ToInt32(js.ExecuteScript(script));

          // Size of the whole page.
          int totalWidth = evalInt("return document.body.offsetWidth");
          int totalHeight = evalInt("return document.body.parentNode.scrollHeight");

          // Size of the viewport.
          int viewportWidth = evalInt("return document.body.clientWidth");
          int viewportHeight = evalInt("return window.innerHeight");

          // A zero-sized viewport would make the tiling loops below never advance.
          if (viewportWidth <= 0 || viewportHeight <= 0)
          {
              throw new InvalidOperationException("Browser reported an empty viewport.");
          }

          // Split the page into viewport-sized tiles, row by row; tiles on the right
          // and bottom edges are shrunk to fit the page.
          List<Rectangle> rectangles = new List<Rectangle>();
          for (int top = 0; top < totalHeight; top += viewportHeight)
          {
              int tileHeight = Math.Min(viewportHeight, totalHeight - top);
              for (int left = 0; left < totalWidth; left += viewportWidth)
              {
                  int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                  rectangles.Add(new Rectangle(left, top, tileWidth, tileHeight));
              }
          }

          // The first tile is captured without scrolling, so start from the page origin.
          js.ExecuteScript("window.scrollTo(0, 0)");

          stitchedImage = new Bitmap(totalWidth, totalHeight);
          using (Graphics g = Graphics.FromImage(stitchedImage))
          {
              Rectangle previous = Rectangle.Empty;
              foreach (Rectangle rectangle in rectangles)
              {
                  // Scroll from the previous tile to the current one (not needed for the first).
                  if (previous != Rectangle.Empty)
                  {
                      int xDiff = rectangle.Right - previous.Right;
                      int yDiff = rectangle.Bottom - previous.Bottom;
                      js.ExecuteScript(String.Format("window.scrollBy({0}, {1})", xDiff, yDiff));
                      // Give the browser time to repaint after scrolling.
                      System.Threading.Thread.Sleep(200);
                  }

                  var screenshot = camera.GetScreenshot();

                  // Image.FromStream requires the stream to stay open for the lifetime of
                  // the image, so both are kept alive until the tile has been drawn.
                  using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                  using (Image screenshotImage = Image.FromStream(memStream))
                  {
                      // For a shrunken edge tile the new content sits at the right/bottom
                      // of the viewport, hence the offset into the screenshot.
                      Rectangle sourceRectangle = new Rectangle(
                          viewportWidth - rectangle.Width,
                          viewportHeight - rectangle.Height,
                          rectangle.Width,
                          rectangle.Height);

                      g.DrawImage(screenshotImage, rectangle, sourceRectangle, GraphicsUnit.Pixel);
                  }

                  previous = rectangle;
              }
          }
      }
      catch (Exception ex)
      {
          // Best effort: report the failure and return what has been built so far.
          Console.Error.WriteLine("Full-page screenshot failed: " + ex.Message);
      }

      return stitchedImage;
  }

  ```

  最后是根据传入的元素和全屏截图,裁剪出价格元素对应的图片

  ```C#

  /// <summary>
  /// Crops the area occupied by <paramref name="element"/> out of a full-page screenshot.
  /// </summary>
  /// <param name="element">Element whose Location and Size define the crop area.</param>
  /// <param name="bitmap">Full-page screenshot to crop from.</param>
  /// <returns>
  /// A new image containing the part of the element that lies inside
  /// <paramref name="bitmap"/>, or null when the element lies completely outside it.
  /// </returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="element"/> or <paramref name="bitmap"/> is null.
  /// </exception>
  public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
  {
      if (element == null)
      {
          throw new ArgumentNullException("element");
      }
      if (bitmap == null)
      {
          throw new ArgumentNullException("bitmap");
      }

      // Clamp position and size to the bitmap: Bitmap.Clone throws when the
      // rectangle reaches outside the source image.
      Rectangle crop = new Rectangle(element.Location, element.Size);
      crop.Intersect(new Rectangle(Point.Empty, bitmap.Size));

      if (crop.Width <= 0 || crop.Height <= 0)
      {
          return null;
      }

      return bitmap.Clone(crop, bitmap.PixelFormat);
  }

  ```

  以上内容为大家介绍了自动化测试中的Selenium&EmguCV实现爬虫图片识别,本文由多测师亲自撰写,希望对大家有所帮助。了解更多自动化测试相关知识:https://www.aichudan.com/xwzx/

联系电话

17727591462

返回顶部